Skip to content

DuckDB Adapter

DuckDBAdapter

Bases: DatabaseAdapter

Source code in synalinks/src/knowledge_bases/database_adapters/duckdb_adapter.py
 126
 127
 128
 129
 130
 131
 132
 133
 134
 135
 136
 137
 138
 139
 140
 141
 142
 143
 144
 145
 146
 147
 148
 149
 150
 151
 152
 153
 154
 155
 156
 157
 158
 159
 160
 161
 162
 163
 164
 165
 166
 167
 168
 169
 170
 171
 172
 173
 174
 175
 176
 177
 178
 179
 180
 181
 182
 183
 184
 185
 186
 187
 188
 189
 190
 191
 192
 193
 194
 195
 196
 197
 198
 199
 200
 201
 202
 203
 204
 205
 206
 207
 208
 209
 210
 211
 212
 213
 214
 215
 216
 217
 218
 219
 220
 221
 222
 223
 224
 225
 226
 227
 228
 229
 230
 231
 232
 233
 234
 235
 236
 237
 238
 239
 240
 241
 242
 243
 244
 245
 246
 247
 248
 249
 250
 251
 252
 253
 254
 255
 256
 257
 258
 259
 260
 261
 262
 263
 264
 265
 266
 267
 268
 269
 270
 271
 272
 273
 274
 275
 276
 277
 278
 279
 280
 281
 282
 283
 284
 285
 286
 287
 288
 289
 290
 291
 292
 293
 294
 295
 296
 297
 298
 299
 300
 301
 302
 303
 304
 305
 306
 307
 308
 309
 310
 311
 312
 313
 314
 315
 316
 317
 318
 319
 320
 321
 322
 323
 324
 325
 326
 327
 328
 329
 330
 331
 332
 333
 334
 335
 336
 337
 338
 339
 340
 341
 342
 343
 344
 345
 346
 347
 348
 349
 350
 351
 352
 353
 354
 355
 356
 357
 358
 359
 360
 361
 362
 363
 364
 365
 366
 367
 368
 369
 370
 371
 372
 373
 374
 375
 376
 377
 378
 379
 380
 381
 382
 383
 384
 385
 386
 387
 388
 389
 390
 391
 392
 393
 394
 395
 396
 397
 398
 399
 400
 401
 402
 403
 404
 405
 406
 407
 408
 409
 410
 411
 412
 413
 414
 415
 416
 417
 418
 419
 420
 421
 422
 423
 424
 425
 426
 427
 428
 429
 430
 431
 432
 433
 434
 435
 436
 437
 438
 439
 440
 441
 442
 443
 444
 445
 446
 447
 448
 449
 450
 451
 452
 453
 454
 455
 456
 457
 458
 459
 460
 461
 462
 463
 464
 465
 466
 467
 468
 469
 470
 471
 472
 473
 474
 475
 476
 477
 478
 479
 480
 481
 482
 483
 484
 485
 486
 487
 488
 489
 490
 491
 492
 493
 494
 495
 496
 497
 498
 499
 500
 501
 502
 503
 504
 505
 506
 507
 508
 509
 510
 511
 512
 513
 514
 515
 516
 517
 518
 519
 520
 521
 522
 523
 524
 525
 526
 527
 528
 529
 530
 531
 532
 533
 534
 535
 536
 537
 538
 539
 540
 541
 542
 543
 544
 545
 546
 547
 548
 549
 550
 551
 552
 553
 554
 555
 556
 557
 558
 559
 560
 561
 562
 563
 564
 565
 566
 567
 568
 569
 570
 571
 572
 573
 574
 575
 576
 577
 578
 579
 580
 581
 582
 583
 584
 585
 586
 587
 588
 589
 590
 591
 592
 593
 594
 595
 596
 597
 598
 599
 600
 601
 602
 603
 604
 605
 606
 607
 608
 609
 610
 611
 612
 613
 614
 615
 616
 617
 618
 619
 620
 621
 622
 623
 624
 625
 626
 627
 628
 629
 630
 631
 632
 633
 634
 635
 636
 637
 638
 639
 640
 641
 642
 643
 644
 645
 646
 647
 648
 649
 650
 651
 652
 653
 654
 655
 656
 657
 658
 659
 660
 661
 662
 663
 664
 665
 666
 667
 668
 669
 670
 671
 672
 673
 674
 675
 676
 677
 678
 679
 680
 681
 682
 683
 684
 685
 686
 687
 688
 689
 690
 691
 692
 693
 694
 695
 696
 697
 698
 699
 700
 701
 702
 703
 704
 705
 706
 707
 708
 709
 710
 711
 712
 713
 714
 715
 716
 717
 718
 719
 720
 721
 722
 723
 724
 725
 726
 727
 728
 729
 730
 731
 732
 733
 734
 735
 736
 737
 738
 739
 740
 741
 742
 743
 744
 745
 746
 747
 748
 749
 750
 751
 752
 753
 754
 755
 756
 757
 758
 759
 760
 761
 762
 763
 764
 765
 766
 767
 768
 769
 770
 771
 772
 773
 774
 775
 776
 777
 778
 779
 780
 781
 782
 783
 784
 785
 786
 787
 788
 789
 790
 791
 792
 793
 794
 795
 796
 797
 798
 799
 800
 801
 802
 803
 804
 805
 806
 807
 808
 809
 810
 811
 812
 813
 814
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
class DuckDBAdapter(DatabaseAdapter):
    def __init__(
        self,
        uri=None,
        embedding_model=None,
        data_models=None,
        stemmer="porter",
        stopwords=None,
        ignore=None,
        strip_accents=None,
        lower=None,
        metric="cosine",
        ef_construction=None,
        ef_search=None,
        M=None,
        M0=None,
        vss_key=VSS_KEY,
        main_table=MAIN_TABLE,
        vector_dim=None,
        wipe_on_start=False,
        name=None,
        encryption_key=None,
        **kwargs,
    ):
        """Open (creating if needed) the DuckDB file and register data models.

        The steps run in a fixed order: resolve the path, validate and store
        the index options, install extensions, bootstrap the file through
        ``ATTACH`` when it does not exist yet, open and sandbox the persistent
        connection, optionally wipe it, then register the data models.

        Args:
            uri: Database location handed to ``resolve_db_path`` together with
                ``name``; the resolved path is stored on ``self.uri``.
            embedding_model: Passed through ``_get_em``. When the result is
                truthy the ``vss`` extension is installed/loaded and table
                definitions gain an embedding column.
            data_models: Optional iterable of data models exposing
                ``get_schema()``. Each is stored under
                ``table_identifier(title)`` and its table is created right
                away unless creation is deferred. When falsy, the tables that
                already exist are reflected instead.
            stemmer: Must be one of ``STEMMERS``.
            stopwords, ignore, strip_accents, lower: FTS-index options, stored
                as given (``None`` means "use the engine default").
            metric: Must be one of ``METRICS``.
            ef_construction, ef_search, M, M0: HNSW-index options, stored as
                given (``None`` means "use the engine default").
            vss_key: Name of the embedding column.
            main_table: Accepted but not read by this constructor.
            vector_dim: Explicit embedding dimension. When omitted while an
                embedding model is set, table creation is deferred.
            wipe_on_start: When true, ``wipe_database()`` runs before any data
                model is registered.
            name: Forwarded to ``resolve_db_path``.
            encryption_key: Optional key bound as a parameter to ``ATTACH``.
            **kwargs: Accepted but not read by this constructor.

        Raises:
            ValueError: If ``stemmer`` or ``metric`` is not an allowed value,
                or if a supplied data model's schema has no ``title``.
        """
        # Resolve URI → backing path. ``None`` defaults to
        # ``{synalinks_home()}/{name or 'database'}.db`` so a no-args
        # KnowledgeBase() persists across processes. The helper also
        # enforces the path-traversal gate on ``name``.
        self.uri = resolve_db_path(uri, scheme="duckdb", extension="db", name=name)

        # Bound as a `?` param to ATTACH; deliberately excluded from get_config().
        self._encryption_key: Optional[str] = encryption_key

        self.embedding_model = _get_em(embedding_model)

        # Vector dimension resolved lazily on the main loop the first time a
        # table is created (see `_ensure_vector_dim`), never via a probe here.
        # The old probe used `run_maybe_nested`, running the embedding on a
        # transient thread-loop and binding litellm's process-global httpx
        # client to a loop closed moments later — poisoning it for every later
        # main-loop embedding ("Event loop is closed"). `vector_dim=` short-
        # circuits.
        self.vector_dim = vector_dim
        # The `FLOAT[dim]` embedding column needs the dimension up front, so when
        # an embedding model is set without an explicit `vector_dim`, defer
        # creating the declared tables until the first `update` (where the dim is
        # learned on the main loop). Only this case is deferred.
        self._defer_table_creation = bool(self.embedding_model and not vector_dim)

        if stemmer not in STEMMERS:
            raise ValueError(f"`stemmer` parameter should be one of {STEMMERS}")
        self.stemmer = stemmer

        if metric not in METRICS:
            raise ValueError(f"`metric` parameter should be one of {METRICS}")
        self.metric = metric

        # FTS-index build params (forwarded to ``PRAGMA create_fts_index``).
        # ``None`` defers to the DuckDB default for that arg so we don't
        # have to track upstream defaults (``stopwords='english'``,
        # ``strip_accents=1``, ``lower=1``, etc.) — only user-supplied
        # values get serialized into the pragma.
        self.stopwords = stopwords
        self.ignore = ignore
        self.strip_accents = strip_accents
        self.lower = lower

        # HNSW build params (forwarded to ``CREATE INDEX ... WITH (...)``).
        # Same None-defers-to-engine convention as the FTS knobs and as the
        # Ladybug adapter's ``mu`` / ``ml`` / ``pu`` / ``efc``.
        self.ef_construction = ef_construction
        self.ef_search = ef_search
        self.M = M
        self.M0 = M0

        self.vss_key = vss_key
        # NOTE(review): `main_table` and `**kwargs` are not stored or read
        # anywhere in this constructor — confirm whether that is intentional.

        # Single persistent RW connection — DuckDB holds an exclusive file
        # lock for its lifetime, so this process is the only writer.
        # Assigned before any step that can raise so `close()` always finds it.
        self._con: Optional[duckdb.DuckDBPyConnection] = None

        self._install_extensions()

        # Encrypted DBs must be created through ATTACH-with-key, not bare connect.
        # The throwaway in-memory connection only exists to run that first ATTACH.
        if not os.path.exists(self.uri):
            tmp = duckdb.connect(":memory:")
            try:
                self._attach_db(tmp)
                tmp.execute(f"DETACH {_ATTACH_ALIAS}")
            finally:
                tmp.close()

        self._con = self._open_main_connection()
        self._sandbox(self._con)

        # Wipe before registering so freshly declared tables are not dropped.
        if wipe_on_start:
            self.wipe_database()

        self.data_models: Dict[str, Any] = {}
        if data_models:
            for dm in data_models:
                title = dm.get_schema().get("title")
                if title is None:
                    raise ValueError(
                        "Each registered data model must carry a "
                        "schema `title`; got a schema with no title."
                    )
                self.data_models[table_identifier(title)] = dm
                if not self._defer_table_creation:
                    self._maybe_create_table(dm)
        else:
            # No models declared: mirror whatever tables the database already has.
            # NOTE(review): keys here are the raw reflected titles, not passed
            # through `table_identifier` like the branch above — presumably the
            # reflected names are already normalised; confirm.
            for dm in self.get_symbolic_data_models():
                self.data_models[dm.get_schema().get("title")] = dm

    def _attach_db(self, con, read_only=False):
        """Run ``ATTACH`` for ``self.uri`` on ``con`` under ``_ATTACH_ALIAS``.

        The file path is embedded as a SQL string literal with single quotes
        doubled. The encryption key, when one is configured, travels as a
        bound ``?`` parameter rather than as SQL text.

        Args:
            con: Connection on which the ``ATTACH`` statement is executed.
            read_only: When true, the ``READ_ONLY`` option is added.
        """
        escaped_path = self.uri.replace("'", "''")
        statement = f"ATTACH '{escaped_path}' AS {_ATTACH_ALIAS}"

        attach_options = []
        bound_values = []
        key = self._encryption_key
        if key is not None:
            attach_options.append("ENCRYPTION_KEY ?")
            bound_values.append(key)
        if read_only:
            attach_options.append("READ_ONLY")

        # The parenthesised option list is only emitted when non-empty.
        if attach_options:
            statement += f" ({', '.join(attach_options)})"
        con.execute(statement, bound_values)

    def _open_main_connection(self):
        """Return a new read-write connection to the configured database.

        Without an encryption key the file is opened directly. With a key, an
        in-memory connection is opened, the file is attached through
        ``_attach_db`` and made the default catalog with ``USE``.
        """
        if self._encryption_key is None:
            return duckdb.connect(self.uri, read_only=False)

        main_con = duckdb.connect(":memory:")
        self._attach_db(main_con)
        main_con.execute(f"USE {_ATTACH_ALIAS}")
        return main_con

    def _open_loose_connection(self):
        """Open a fresh, NON-sandboxed connection for native bulk loads.

        ``read_csv`` / ``read_parquet`` need ``enable_external_access=true``,
        which the sandboxed persistent connection has permanently disabled.

        Extension loading is best-effort: a failed ``LOAD`` emits a warning
        (the same way ``_sandbox`` reports it) and the connection is still
        returned, so plain bulk loads keep working.

        Returns:
            A new read-write connection; the caller is responsible for
            closing it.
        """
        if self._encryption_key is not None:
            con = duckdb.connect(":memory:")
            self._attach_db(con)
            con.execute(f"USE {_ATTACH_ALIAS}")
        else:
            con = duckdb.connect(self.uri, read_only=False)
        try:
            con.execute("LOAD fts;")
            if self.embedding_model:
                con.execute("LOAD vss;")
                con.execute("SET hnsw_enable_experimental_persistence=true;")
        except duckdb.Error as e:
            # Do not fail the bulk load itself, but do not hide the problem
            # either: later FTS / VSS statements on this connection will fail.
            warnings.warn(
                f"DuckDB extension LOAD failed on the bulk-load connection; "
                f"FTS / VSS statements on it will fail until the extensions "
                f"install successfully. ({e})"
            )
        return con

    @contextmanager
    def _connect(self, read_only=False):
        """Yield the adapter's cached connection.

        ``read_only`` only changes the bootstrap throwaway path; the cached
        connection is always RW (see ``__init__``). ``query(read_only=True)``
        enforces SELECT-only at the parser layer for untrusted SQL.
        """
        if self._con is None:
            if self._encryption_key is not None:
                con = duckdb.connect(":memory:")
                self._attach_db(con, read_only=read_only)
                con.execute(f"USE {_ATTACH_ALIAS}")
            else:
                con = duckdb.connect(self.uri, read_only=read_only)
            try:
                yield con
            finally:
                con.close()
        else:
            yield self._con

    def _sandbox(self, con):
        """Lock the persistent connection down against external I/O.

        Without ``enable_external_access=false``, a query like
        ``SELECT * FROM read_csv('/etc/passwd', ...)`` passes parser-level
        SELECT-only checks and exfiltrates host files. Extensions must be
        LOAD-ed before the switch flips — LOAD itself needs external access.
        """
        try:
            con.execute("LOAD fts;")
            if self.embedding_model:
                con.execute("LOAD vss;")
                con.execute("SET hnsw_enable_experimental_persistence=true;")
        except duckdb.Error as e:
            warnings.warn(
                f"DuckDB extension LOAD failed; FTS / VSS queries will fail "
                f"until the extensions install successfully. ({e})"
            )
        con.execute("SET enable_external_access=false;")

    def close(self):
        """Close the adapter's cached connection. Idempotent."""
        if self._con is not None:
            try:
                self._con.close()
            except duckdb.Error:
                pass
            self._con = None

    def __del__(self):
        try:
            self.close()
        except Exception:
            pass

    def _install_extensions(self):
        """Install ``fts`` (and ``vss`` when embeddings are used) if missing.

        A throwaway in-memory connection asks which extensions are already
        installed; a second throwaway connection runs ``INSTALL`` for the
        missing ones. A ``duckdb.PermissionException`` at either stage is
        treated as "nothing to do"; any other ``duckdb.Error`` during the
        probe means every required extension is (re)installed, and during
        installation it is reported as a warning.
        """
        required = ["fts"]
        if self.embedding_model:
            required.append("vss")

        try:
            probe = duckdb.connect(":memory:")
            try:
                rows = probe.execute(
                    "SELECT extension_name FROM duckdb_extensions() WHERE installed"
                ).fetchall()
            finally:
                probe.close()
            already_installed = set()
            for row in rows:
                already_installed.add(row[0])
            missing = [ext for ext in required if ext not in already_installed]
            if not missing:
                return
        except duckdb.PermissionException:
            # Sibling adapter already disabled external access process-wide.
            return
        except duckdb.Error:
            # Could not tell what is installed: try to install everything.
            missing = required

        installer = duckdb.connect(":memory:")
        try:
            for extension in missing:
                installer.execute(f"INSTALL {extension};")
        except duckdb.PermissionException:
            pass
        except duckdb.Error as install_error:
            warnings.warn(
                f"Failed to install DuckDB extensions {missing}; "
                "FTS / VSS queries will not work until the extensions "
                f"are available. ({install_error})"
            )
        finally:
            installer.close()

    def wipe_database(self):
        """Drop every table in the schema named by ``MAIN_TABLE``.

        Table names are read back from ``information_schema.tables`` and may
        therefore be anything the database contains, so each one is emitted
        as a double-quoted identifier (embedded quotes doubled) instead of
        being pasted into the statement verbatim.

        Raises:
            RuntimeError: If a table cannot be dropped. The original error is
                chained as ``__cause__``.
        """
        with self._connect(read_only=False) as con:
            tables = con.execute(f"""
                SELECT table_name
                FROM information_schema.tables
                WHERE table_schema='{MAIN_TABLE}';
            """).fetchall()

            for (table_name,) in tables:
                # Quote so reserved words / special characters stay valid SQL.
                quoted_name = '"' + table_name.replace('"', '""') + '"'
                try:
                    con.execute(f"DROP TABLE IF EXISTS {quoted_name}")
                except Exception as e:
                    raise RuntimeError(
                        f"Failed to drop table {table_name}: {e}"
                    ) from e

    def _get_id_key(self, schema: dict) -> str:
        """Return the first schema property (snake-cased, sanitized) as primary key."""
        props = schema.get("properties") if isinstance(schema, dict) else None
        if not props:
            raise ValueError("Cannot determine primary key: schema has no `properties`.")
        return column_identifier(next(iter(props.keys())))

    def _has_indexable_text_columns(self, schema: dict) -> bool:
        """Return ``True`` if the schema has at least one VARCHAR column
        besides the primary key. Used to gate FTS index creation and
        BM25 queries: DuckDB's ``create_fts_index('*', ...)`` requires
        at least one indexable text column, and the id column is
        excluded by the pragma itself.
        """
        id_key = self._get_id_key(schema)
        for name, info in schema.get("properties", {}).items():
            if column_identifier(name) == id_key:
                continue
            if (
                info.get("type") == "string"
                and info.get("format") not in _DATE_LIKE_FORMATS
            ):
                return True
        return False

    def _duckdb_table_to_json_schema(
        self,
        table_name: str,
        remove_embedding: bool = True,
    ) -> dict:
        with self._connect(read_only=True) as con:
            info = con.execute(f"PRAGMA table_info('{table_name}')").fetchall()
            props = {}
            for _, name, dtype, _, _, _ in info:
                if name == self.vss_key and remove_embedding:
                    continue
                elif dtype == duckdb.sqltypes.DuckDBPyType(str):
                    props[name] = {"title": name.title(), "type": "string"}
                elif dtype == duckdb.sqltypes.DuckDBPyType(float):
                    props[name] = {"title": name.title(), "type": "number"}
                elif dtype == duckdb.sqltypes.DuckDBPyType(int):
                    props[name] = {"title": name.title(), "type": "integer"}
                elif dtype == duckdb.sqltypes.DuckDBPyType(bool):
                    props[name] = {"title": name.title(), "type": "boolean"}
                elif dtype == duckdb.sqltypes.DuckDBPyType(list[Union[str]]):
                    props[name] = {
                        "title": name.title(),
                        "items": {"type": "string"},
                        "type": "array",
                    }
                elif dtype == duckdb.sqltypes.DuckDBPyType(list[Union[float]]):
                    props[name] = {
                        "title": name.title(),
                        "items": {"type": "number"},
                        "type": "array",
                    }
                elif dtype == duckdb.sqltypes.DuckDBPyType(list[Union[int]]):
                    props[name] = {
                        "title": name.title(),
                        "items": {"type": "integer"},
                        "type": "array",
                    }
                elif dtype == duckdb.sqltypes.DuckDBPyType(list[Union[bool]]):
                    props[name] = {
                        "title": name.title(),
                        "items": {"type": "boolean"},
                        "type": "array",
                    }
                elif dtype == duckdb.sqltypes.DATE:
                    props[name] = {
                        "title": name.title(),
                        "type": "string",
                        "format": "date",
                    }
                elif dtype in (duckdb.sqltypes.TIMESTAMP, duckdb.sqltypes.TIMESTAMP_TZ):
                    props[name] = {
                        "title": name.title(),
                        "type": "string",
                        "format": "date-time",
                    }
                elif dtype == duckdb.sqltypes.TIME:
                    props[name] = {
                        "title": name.title(),
                        "type": "string",
                        "format": "time",
                    }
                elif str(dtype) == "JSON":
                    props[name] = {
                        "title": name.title(),
                        "type": "object",
                    }
                else:
                    raise NotImplementedError(
                        f"Type '{dtype}' not supported by {self.__class__.__name__}"
                        " at the moment, please fill out an issue."
                    )

            return {
                "title": table_name,
                "type": "object",
                "additionalProperties": False,
                "required": list(props.keys()),
                "properties": props,
            }

    def _json_schema_to_duckdb_columns(self, json_schema: dict):
        """Convert JSON schema to DuckDB column definitions. First property is PK."""
        properties = json_schema.get("properties", {})
        defs = json_schema.get("$defs", {})
        out = []
        first_col = True

        for prop_name, prop_spec in properties.items():
            prop_name = column_identifier(prop_name)

            # Resolve Pydantic v2 $ref for enums / nested models.
            if "$ref" in prop_spec:
                ref_name = prop_spec["$ref"].rsplit("/", 1)[-1]
                if ref_name in defs:
                    resolved = dict(defs[ref_name])
                    resolved.update({k: v for k, v in prop_spec.items() if k != "$ref"})
                    prop_spec = resolved

            prop_type = prop_spec.get("type")

            if prop_name == self.vss_key:
                continue

            if not prop_type and "anyOf" in prop_spec:
                for variant in prop_spec["anyOf"]:
                    if "$ref" in variant:
                        ref_name = variant["$ref"].rsplit("/", 1)[-1]
                        if ref_name in defs:
                            variant = defs[ref_name]
                    vtype = variant.get("type")
                    if vtype and vtype != "null":
                        prop_type = vtype
                        prop_spec = variant
                        break
                    if "enum" in variant:
                        prop_type = "string"
                        break

            if not prop_type and "enum" in prop_spec:
                prop_type = "string"

            if not prop_type:
                raise ValueError(f"Malformed JSON schema: missing type for '{prop_name}'")

            col_def = None

            if prop_type == "array":
                item_spec = prop_spec.get("items")
                if not item_spec:
                    col_def = f"{prop_name} JSON"
                else:
                    item_type = item_spec.get("type")
                    if item_type == "string":
                        dtype = duckdb.sqltypes.DuckDBPyType(list[Union[str]])
                        col_def = f"{prop_name} {dtype}"
                    elif item_type == "number":
                        dtype = duckdb.sqltypes.DuckDBPyType(list[Union[float]])
                        col_def = f"{prop_name} {dtype}"
                    elif item_type == "integer":
                        dtype = duckdb.sqltypes.DuckDBPyType(list[Union[int]])
                        col_def = f"{prop_name} {dtype}"
                    elif item_type == "boolean":
                        dtype = duckdb.sqltypes.DuckDBPyType(list[Union[bool]])
                        col_def = f"{prop_name} {dtype}"
                    elif item_type == "object":
                        col_def = f"{prop_name} JSON"
                    else:
                        raise ValueError(
                            f"Unsupported array item type '{item_type}' for '{prop_name}'"
                        )
            elif prop_type == "object":
                col_def = f"{prop_name} JSON"
            elif prop_type == "string":
                fmt = prop_spec.get("format")
                if fmt == "date":
                    col_def = f"{prop_name} DATE"
                elif fmt == "date-time":
                    col_def = f"{prop_name} TIMESTAMP"
                elif fmt == "time":
                    col_def = f"{prop_name} TIME"
                else:
                    col_def = f"{prop_name} VARCHAR"
            elif prop_type == "number":
                dtype = duckdb.sqltypes.DuckDBPyType(float)
                col_def = f"{prop_name} {dtype}"
            elif prop_type == "integer":
                dtype = duckdb.sqltypes.DuckDBPyType(int)
                col_def = f"{prop_name} {dtype}"
            elif prop_type == "boolean":
                dtype = duckdb.sqltypes.DuckDBPyType(bool)
                col_def = f"{prop_name} {dtype}"
            else:
                raise ValueError(f"Unsupported JSON schema type: '{prop_type}'")

            if first_col and col_def:
                col_def += " PRIMARY KEY"
                first_col = False

            if col_def:
                out.append(col_def)

        if self.embedding_model:
            out.append(f"{self.vss_key} FLOAT[{self.vector_dim}]")
        return ", ".join(out)

    def get_symbolic_data_models(
        self,
        remove_embedding=True,
    ) -> List[SymbolicDataModel]:
        """Reflect every table in the ``main`` schema into a symbolic model.

        Args:
            remove_embedding (bool): Strip the embedding column from each
                reflected schema. Defaults to ``True``. The flag is forwarded
                to ``_duckdb_table_to_json_schema``, which performs the
                actual filtering.

        Returns:
            List[SymbolicDataModel]: One ``SymbolicDataModel`` per table,
                representing the current database schema.
        """
        with self._connect(read_only=True) as con:
            tables = con.execute("""
                SELECT table_name 
                FROM information_schema.tables 
                WHERE table_schema='main';
            """).fetchall()

            symbolic_data_models = []
            for (table_name,) in tables:
                # Honour the caller's choice instead of always stripping.
                schema = self._duckdb_table_to_json_schema(
                    table_name,
                    remove_embedding=remove_embedding,
                )
                model = SymbolicDataModel(schema=schema, name=table_name)
                symbolic_data_models.append(model)
            return symbolic_data_models

    async def _ensure_vector_dim(self, sample_vector=None):
        """Resolve the embedding dimension lazily, on the current event loop.

        Prefers the length of an embedding vector already in hand (records
        arrive pre-embedded from ``EmbedKnowledge``), falling back to a probe
        awaited on *this* loop only when the vector column must be created
        before any embedded record is available. Never uses ``run_maybe_nested``
        (a transient-loop network call poisons litellm's global client).
        """
        if self.vector_dim is not None or not self.embedding_model:
            return
        if sample_vector:
            self.vector_dim = len(sample_vector)
            return
        probe = await self.embedding_model(EmbeddingRequest(texts=["text"]))
        embeddings = probe.get("embeddings") if probe is not None else None
        if not embeddings:
            raise ValueError(
                f"Embedding model {self.embedding_model} returned no embeddings "
                "while resolving the vector dimension. This usually means the "
                "model name is wrong or unavailable for your provider/API key. "
                "Fix the embedding model, or pass an explicit `vector_dim=...`."
            )
        self.vector_dim = len(embeddings[0])

    def _maybe_create_table(
        self,
        data_model: Union[JsonDataModel, SymbolicDataModel],
    ):
        """Create the table backing ``data_model`` unless it already exists.

        The table name is derived from the schema ``title`` via
        ``table_identifier``. If ``information_schema.tables`` already
        lists that name, nothing is done; otherwise the column list is
        generated with ``_json_schema_to_duckdb_columns`` and a
        ``CREATE TABLE IF NOT EXISTS`` statement is executed.

        Args:
            data_model: Model whose ``get_schema()`` describes the table.

        Raises:
            RuntimeError: If the ``CREATE TABLE`` statement fails. The
                underlying exception is chained as ``__cause__``.
        """
        with self._connect(read_only=False) as con:
            json_schema = data_model.get_schema()
            table_name = table_identifier(json_schema.get("title"))

            # Bind the table name as a parameter (same form as the catalog
            # lookups in ``rename`` / ``drop_table``) instead of splicing it
            # into the SQL text.
            exists = con.execute(
                "SELECT COUNT(*) FROM information_schema.tables "
                f"WHERE table_schema='{MAIN_TABLE}' AND table_name=?",
                [table_name],
            ).fetchone()[0]

            if exists:
                return

            columns = self._json_schema_to_duckdb_columns(json_schema)
            create_sql = f"CREATE TABLE IF NOT EXISTS {table_name} ({columns});"

            try:
                con.execute(create_sql)
            except Exception as e:
                # Chain the cause so the original DuckDB traceback survives.
                raise RuntimeError(
                    f"Failed to create table '{table_name}': {e}"
                ) from e

    @staticmethod
    def _render_sql_literal(value: Any) -> str:
        """Render a Python value as a SQL literal for DDL/PRAGMA options.

        Used to interpolate adapter init kwargs (stemmer, stopwords,
        HNSW ``WITH`` options, etc.) into statements where DuckDB
        rejects ``?``-parameters at the parser layer. Bools become
        ``0`` / ``1`` (DuckDB's accepted BOOLEAN literal forms in
        PRAGMA / WITH clauses); ints and floats are emitted bare;
        strings are single-quoted with embedded quotes doubled.
        """
        if isinstance(value, bool):
            return "1" if value else "0"
        if isinstance(value, (int, float)):
            return str(value)
        return "'" + str(value).replace("'", "''") + "'"

    def _fts_index_options(self, overwrite: bool) -> str:
        """Render the optional kwargs for ``PRAGMA create_fts_index``.

        Only adapter init values that were explicitly supplied (i.e.
        not ``None``) get serialized — DuckDB picks its own defaults
        otherwise, so this avoids hard-coding upstream defaults
        (``stopwords='english'``, ``strip_accents=1`` etc.) into the
        adapter. ``overwrite`` is always emitted because callers pass
        an explicit ``True`` / ``False`` per rebuild.
        """
        opts: List[str] = [f"stemmer='{self.stemmer}'"]
        for key, value in (
            ("stopwords", self.stopwords),
            ("ignore", self.ignore),
            ("strip_accents", self.strip_accents),
            ("lower", self.lower),
        ):
            if value is None:
                continue
            opts.append(f"{key}={self._render_sql_literal(value)}")
        opts.append(f"overwrite={1 if overwrite else 0}")
        return ", ".join(opts)

    def _maybe_create_fulltext_index(
        self,
        data_model: Union[JsonDataModel, SymbolicDataModel],
        overwrite: bool = True,
    ):
        """Run ``PRAGMA create_fts_index`` for the table behind ``data_model``.

        Returns without touching the database when
        ``_has_indexable_text_columns`` reports nothing to index.

        Args:
            data_model: Model whose schema names the table and its id key.
            overwrite: Forwarded to ``_fts_index_options``. Defaults to
                ``True``.
        """
        with self._connect(read_only=False) as con:
            schema = data_model.get_schema()
            target = table_identifier(schema.get("title"))
            pk_column = self._get_id_key(schema)

            if not self._has_indexable_text_columns(schema):
                return

            fts_options = self._fts_index_options(overwrite)
            # The '*' argument asks DuckDB to index every VARCHAR column
            # (the pragma itself leaves the id column out), so no explicit
            # list of searchable fields is needed.
            pragma_sql = f"""
                PRAGMA create_fts_index(
                    'main.{target}',
                    '{pk_column}',
                    '*',
                    {fts_options}
                );
            """
            con.execute(pragma_sql)

    def _hnsw_with_clause(self) -> str:
        """Render the ``WITH (...)`` clause for ``CREATE INDEX ... USING HNSW``.

        ``metric`` is always emitted (the adapter validates it at
        init); the other knobs (``ef_construction``, ``ef_search``,
        ``M``, ``M0``) follow the None-defers-to-engine convention so
        users only pin the values they actually want to tune.
        """
        opts: List[str] = [f"metric = '{self.metric}'"]
        for key, value in (
            ("ef_construction", self.ef_construction),
            ("ef_search", self.ef_search),
            ("M", self.M),
            ("M0", self.M0),
        ):
            if value is None:
                continue
            opts.append(f"{key} = {self._render_sql_literal(value)}")
        return ", ".join(opts)

    def _maybe_create_vector_index(
        self,
        data_model: Union[JsonDataModel, SymbolicDataModel],
        overwrite: bool = True,
    ):
        """Create or recreate the HNSW index on the table's embedding column.

        Skipped entirely when no embedding model is configured, and
        skipped after the probe query when the table has no row with a
        non-NULL embedding.

        Args:
            data_model: Model whose schema title names the table.
            overwrite: When ``True`` the existing index is dropped and
                created again; when ``False`` the index is created only
                if it does not exist yet. Defaults to ``True``.
        """
        if not self.embedding_model:
            return

        with self._connect(read_only=False) as con:
            schema = data_model.get_schema()
            table = table_identifier(schema.get("title"))

            probe_sql = (
                f"SELECT EXISTS (SELECT 1 FROM {table} "
                f"WHERE {self.vss_key} IS NOT NULL)"
            )
            populated = con.execute(probe_sql).fetchone()[0]
            if not populated:
                return

            index = f"vector_main_{table}"
            options = self._hnsw_with_clause()
            if overwrite:
                # HNSW indexes can't be CREATE OR REPLACE'd, so drop first.
                con.execute(f"DROP INDEX IF EXISTS {index};")
                create_head = "CREATE INDEX"
            else:
                create_head = "CREATE INDEX IF NOT EXISTS"

            con.execute(
                f"{create_head} {index} ON {table}"
                f" USING HNSW ({self.vss_key})"
                f" WITH ({options});"
            )

    async def update(
        self,
        data_model_or_data_models: Union[List[JsonDataModel], JsonDataModel],
    ) -> Union[Any, List[Any]]:
        """Upsert one record or a list of records and return the primary key(s).

        Records are grouped by ``(table, column tuple)`` so each group is
        written with a single ``executemany`` inside one transaction. The
        first record seen for a table triggers vector-dimension
        resolution and table creation. After the commit, the full-text
        and vector indexes are rebuilt for every touched table; a rebuild
        failure is reported as a warning and does not undo the write.

        Args:
            data_model_or_data_models: A single data model or a list of them.

        Returns:
            The primary key value for a single input, or the list of
            primary key values (input order) for a list input.

        Raises:
            ValueError: If a record has no value for its primary key.
        """
        return_single = not isinstance(data_model_or_data_models, list)
        if return_single:
            records = [data_model_or_data_models]
        else:
            records = data_model_or_data_models

        primary_keys: List[Any] = []
        # One batch per (table, column-shape) -> one executemany each.
        batches: Dict[tuple, Dict[str, Any]] = {}
        # First model seen per table; reused for the index rebuilds below.
        first_models: List[Any] = []
        known_tables: set = set()

        for record in records:
            if not isinstance(record, JsonDataModel):
                record = record.to_json_data_model()

            schema = record.get_schema()
            table = table_identifier(schema["title"])
            payload = sanitize_properties(record.get_json())
            id_key = self._get_id_key(schema)

            pk_value = payload.get(id_key)
            if pk_value is None:
                raise ValueError(f"Primary key '{id_key}' is required but not provided")

            if table not in known_tables:
                # The embedding dimension must be known before the
                # FLOAT[dim] column is created; take it from this record's
                # embedding or from an on-loop probe.
                await self._ensure_vector_dim(payload.get(self.vss_key))
                self._maybe_create_table(record)
                first_models.append(record)
                known_tables.add(table)

            columns = tuple(payload.keys())
            batch = batches.setdefault(
                (table, columns),
                {"id_key": id_key, "cols": columns, "params": []},
            )
            batch["params"].append([payload[name] for name in columns])
            primary_keys.append(pk_value)

        with self._connect(read_only=False) as con:
            con.execute("BEGIN TRANSACTION;")
            try:
                for (table, _shape), batch in batches.items():
                    columns = batch["cols"]
                    id_key = batch["id_key"]

                    assignments = ", ".join(
                        f"{name} = EXCLUDED.{name}"
                        for name in columns
                        if name != id_key
                    )
                    if assignments:
                        on_conflict = (
                            f"ON CONFLICT ({id_key}) DO UPDATE SET {assignments}"
                        )
                    else:
                        # Only the key column is present: nothing to update.
                        on_conflict = f"ON CONFLICT ({id_key}) DO NOTHING"

                    column_list = ", ".join(columns)
                    value_slots = ", ".join("?" for _ in columns)
                    statement = (
                        f"INSERT INTO {table} ({column_list}) "
                        f"VALUES ({value_slots}) "
                        f"{on_conflict};"
                    )

                    con.executemany(statement, batch["params"])
            except Exception:
                con.execute("ROLLBACK;")
                raise
            con.execute("COMMIT;")

        # Index rebuilds are best-effort: the rows are already committed.
        for model in first_models:
            try:
                self._maybe_create_fulltext_index(model)
            except Exception as e:
                table = table_identifier(model.get_schema()["title"])
                warnings.warn(
                    f"FTS index rebuild failed for '{table}'; "
                    f"fulltext_search results may be stale. ({e})"
                )

        for model in first_models:
            try:
                self._maybe_create_vector_index(model)
            except Exception as e:
                table = table_identifier(model.get_schema()["title"])
                warnings.warn(
                    f"Vector index rebuild failed for '{table}'; "
                    f"similarity_search will fall back to scan. ({e})"
                )

        return primary_keys[0] if return_single else primary_keys

    async def from_csv(
        self,
        path: str,
        *,
        table_name: Optional[str] = None,
        table_description: Optional[str] = None,
        delimiter: str = ",",
        encoding: str = "utf-8",
        header: bool = True,
    ) -> SymbolicDataModel:
        """Load a CSV file into a table through DuckDB's ``read_csv``.

        The work is delegated to ``_bulk_load``, so column types come
        from DuckDB's own detection on the file, the first column becomes
        the ``PRIMARY KEY``, and table / column names go through the
        adapter's identifier normalization (PascalCase table name,
        ``snake_case`` columns). According to the original author,
        zero-padded ids such as ``"00123"`` stay ``VARCHAR``.

        This native path skips the per-row Python pipeline that
        ``update(CSVDataset(...))`` goes through, so prefer it for
        non-trivial files; use ``update`` when rows need transforming
        before they are stored.

        Args:
            path: Path to the CSV file.
            table_name: Target table name. When omitted, the file's stem
                is used (``/data/my-docs.csv`` → ``MyDocs``). Always
                normalized to PascalCase.
            table_description: Optional natural-language text stored in
                the resulting schema's top-level ``description`` field.
            delimiter: Field delimiter. Defaults to ``","``.
            encoding: File encoding. Defaults to ``"utf-8"``.
            header: Whether the first row is a header. Defaults to
                ``True``.

        Returns:
            The `SymbolicDataModel` describing the loaded table; hand it
            to ``fulltext_search`` / ``similarity_search`` / ``get`` to
            query the data.
        """

        def _quoted(text: str) -> str:
            # SQL string literal: wrap in single quotes, double embedded ones.
            return "'" + text.replace("'", "''") + "'"

        reader_kwargs = {
            "delim": _quoted(delimiter),
            "header": "true" if header else "false",
            "encoding": _quoted(encoding),
        }
        return await self._bulk_load(
            path,
            table_name=table_name,
            table_description=table_description,
            reader_fn="read_csv",
            reader_kwargs=reader_kwargs,
        )

    async def from_parquet(
        self,
        path: str,
        *,
        table_name: Optional[str] = None,
        table_description: Optional[str] = None,
    ) -> SymbolicDataModel:
        """Load a Parquet file into a table through DuckDB's ``read_parquet``.

        Delegates to ``_bulk_load`` with no extra reader options, so it
        takes the same native ingestion path as `from_csv` rather than
        the Python row pipeline. Column types are those DuckDB reports
        for the file.

        Args:
            path: Path to the Parquet file.
            table_name: Target table name. When omitted, the file's stem
                is used, PascalCase-normalized.
            table_description: Optional schema description.

        Returns:
            The `SymbolicDataModel` describing the loaded table.
        """
        no_reader_options: Dict[str, str] = {}
        loaded_model = await self._bulk_load(
            path,
            table_name=table_name,
            table_description=table_description,
            reader_fn="read_parquet",
            reader_kwargs=no_reader_options,
        )
        return loaded_model

    async def from_json(
        self,
        path: str,
        *,
        table_name: Optional[str] = None,
        table_description: Optional[str] = None,
    ) -> SymbolicDataModel:
        """Load a JSON file holding one top-level array of objects.

        Delegates to ``_bulk_load`` using ``read_json`` with
        ``format='array'``. For sources with one object per line
        (NDJSON), use `from_jsonl` instead.

        Args:
            path: Path to the JSON file. Its content must be a top-level
                array of objects, e.g. ``[{"id": "a", "text": "..."}, ...]``.
            table_name: Target table name. When omitted, the file's stem
                is used, PascalCase-normalized.
            table_description: Optional schema description.

        Returns:
            The `SymbolicDataModel` describing the loaded table.
        """
        array_options = {"format": "'array'"}
        loaded_model = await self._bulk_load(
            path,
            table_name=table_name,
            table_description=table_description,
            reader_fn="read_json",
            reader_kwargs=array_options,
        )
        return loaded_model

    async def from_jsonl(
        self,
        path: str,
        *,
        table_name: Optional[str] = None,
        table_description: Optional[str] = None,
    ) -> SymbolicDataModel:
        """Load a JSON Lines (NDJSON) file, one object per line.

        Delegates to ``_bulk_load`` using ``read_json`` with
        ``format='newline_delimited'``. Suited to large JSON sources
        that are not wrapped in a single array.

        Args:
            path: Path to the JSONL file.
            table_name: Target table name. When omitted, the file's stem
                is used, PascalCase-normalized.
            table_description: Optional schema description.

        Returns:
            The `SymbolicDataModel` describing the loaded table.
        """
        ndjson_options = {"format": "'newline_delimited'"}
        loaded_model = await self._bulk_load(
            path,
            table_name=table_name,
            table_description=table_description,
            reader_fn="read_json",
            reader_kwargs=ndjson_options,
        )
        return loaded_model

    async def _bulk_load(
        self,
        path: str,
        *,
        table_name: Optional[str],
        table_description: Optional[str],
        reader_fn: str,
        reader_kwargs: Dict[str, str],
    ) -> SymbolicDataModel:
        """Native DuckDB bulk load shared by the ``from_csv`` / ``from_parquet``
        / ``from_json`` / ``from_jsonl`` helpers.

        Cycles the sandboxed connection out for a loose one because
        ``read_*`` table functions need ``enable_external_access=true``.

        Steps, in order: validate the path and table name; close the
        adapter's connection and open a loose one; ``DESCRIBE`` the
        source to learn column names/types; ``CREATE TABLE IF NOT
        EXISTS`` with the first column as ``PRIMARY KEY``; upsert every
        source row with ``INSERT ... ON CONFLICT``; reopen and sandbox
        the main connection; reflect the table into a
        ``SymbolicDataModel``; rebuild FTS / vector indexes (failures
        only warn).

        Args:
            path: Path to the source file; must exist on the local
                filesystem (checked with ``os.path.exists``).
            table_name: Explicit table name, or ``None`` to use the
                file's stem.
            table_description: Optional text stored under the schema's
                ``description`` key.
            reader_fn: Name of the DuckDB table function to call
                (e.g. ``read_csv``).
            reader_kwargs: Extra ``key=value`` arguments for the reader.
                Values are spliced into the SQL verbatim, so they must
                already be valid SQL literals.

        Returns:
            The ``SymbolicDataModel`` reflected from the loaded table.

        Raises:
            FileNotFoundError: If ``path`` does not exist.
            ValueError: If no table name can be derived, or if the
                source reports no columns.
        """
        if not os.path.exists(path):
            raise FileNotFoundError(f"File not found: {path}")

        # Fall back to the file's stem when no table name was given.
        raw_name = (
            table_name
            if table_name is not None
            else os.path.splitext(os.path.basename(path))[0]
        )
        if not to_pascal_case(raw_name):
            raise ValueError(
                f"Cannot derive a table name from {raw_name!r}: "
                "after PascalCase normalization, no alphanumeric "
                "content remains. Pass an explicit `table_name=`."
            )
        table_name = table_identifier(raw_name)

        # The file path is always bound as the `?` parameter; only the
        # reader options are rendered into the SQL text.
        if reader_kwargs:
            reader_extras = ", " + ", ".join(f"{k}={v}" for k, v in reader_kwargs.items())
        else:
            reader_extras = ""
        reader_sql = f"{reader_fn}(?{reader_extras})"

        # Release the adapter's own connection before opening the loose one;
        # the outer `finally` below always reopens and re-sandboxes it.
        self._con.close()
        self._con = None
        try:
            loose = self._open_loose_connection()
            try:
                # DESCRIBE yields one row per source column; row[0] is the
                # column name and row[1] its type, as used below.
                desc_rows = loose.execute(
                    f"DESCRIBE SELECT * FROM {reader_sql}",
                    [path],
                ).fetchall()
                if not desc_rows:
                    raise ValueError(f"{path}: no columns detected in source file.")

                # orig_col_names keep the header values exactly as the source
                # reports them (they are double-quoted in the SELECT below);
                # db_col_names are the normalized identifiers used in the
                # table. The INSERT maps between the two by position.
                col_defs = []
                orig_col_names = []
                db_col_names = []
                for i, row in enumerate(desc_rows):
                    orig_name = row[0]
                    db_name = column_identifier(orig_name)
                    orig_col_names.append(orig_name)
                    db_col_names.append(db_name)
                    col_type = row[1]
                    # The first source column is promoted to PRIMARY KEY.
                    if i == 0:
                        col_defs.append(f"{db_name} {col_type} PRIMARY KEY")
                    else:
                        col_defs.append(f"{db_name} {col_type}")
                pk_col = db_col_names[0]

                loose.execute(
                    f"CREATE TABLE IF NOT EXISTS {table_name} ({', '.join(col_defs)})"
                )

                # Upsert: on a key collision overwrite every non-key column,
                # or do nothing when the key is the only column.
                update_cols = [c for c in db_col_names if c != pk_col]
                if update_cols:
                    update_sql = ", ".join(f"{c} = EXCLUDED.{c}" for c in update_cols)
                    conflict_clause = f"ON CONFLICT ({pk_col}) DO UPDATE SET {update_sql}"
                else:
                    conflict_clause = f"ON CONFLICT ({pk_col}) DO NOTHING"

                # Double-quote a source column name, doubling embedded quotes.
                def _quote_src(c):
                    return '"' + c.replace('"', '""') + '"'

                select_list = ", ".join(_quote_src(c) for c in orig_col_names)
                insert_col_list = ", ".join(db_col_names)
                loose.execute(
                    f"INSERT INTO {table_name} ({insert_col_list}) "
                    f"SELECT {select_list} FROM {reader_sql} "
                    f"{conflict_clause}",
                    [path],
                )
            finally:
                loose.close()
        finally:
            # Restore the main connection whether or not the load succeeded.
            self._con = self._open_main_connection()
            self._sandbox(self._con)

        # Reflect the table as it now exists and attach title/description.
        schema = self._duckdb_table_to_json_schema(table_name)
        schema["title"] = table_name
        if table_description is not None:
            schema["description"] = table_description
        symbolic_model = SymbolicDataModel(schema=schema, name=table_name)

        # setdefault: preserve any richer caller-supplied DataModel.
        self.data_models.setdefault(table_name, symbolic_model)

        # Index rebuilds are best-effort: a failure only emits a warning.
        try:
            self._maybe_create_fulltext_index(symbolic_model)
        except Exception as e:
            warnings.warn(
                f"FTS index rebuild failed for '{table_name}'; "
                f"fulltext_search results may be stale. ({e})"
            )

        try:
            self._maybe_create_vector_index(symbolic_model)
        except Exception as e:
            warnings.warn(
                f"Vector index rebuild failed for '{table_name}'; "
                f"similarity_search will fall back to scan. ({e})"
            )

        return symbolic_model

    async def rename(
        self,
        source: Union[Any, str],
        *,
        table_name: Optional[str] = None,
        table_description: Optional[str] = None,
    ) -> SymbolicDataModel:
        """Give a table a new name and/or a new schema description.

        One of ``table_name`` / ``table_description`` is required. If the
        normalized new name differs from the current one, the FTS index
        and the HNSW index are dropped and the table is renamed with
        ``ALTER TABLE``. In every case the adapter's ``data_models``
        registry is updated and both indexes are rebuilt afterwards
        (rebuild failures only warn).

        Args:
            source: The table's ``SymbolicDataModel`` or its name as a
                string. Either way the name goes through
                ``table_identifier``, so the filename-style input given
                to `from_csv` can be passed as-is.
            table_name: New table name. Optional. Always normalized to
                PascalCase.
            table_description: New schema description. Optional. It is
                stored on the ``SymbolicDataModel`` schema (DuckDB does
                not carry per-table descriptions natively). When
                omitted and ``source`` is a model, the description from
                that model's schema is carried over.

        Returns:
            A new `SymbolicDataModel` for the (possibly renamed) table,
            reflected from the database after the rename.

        Raises:
            ValueError: If neither keyword is given, if a model
                ``source`` has no schema title, or if the table does
                not exist.
        """
        if table_name is None and table_description is None:
            raise ValueError(
                "rename(): pass at least one of `table_name=` or `table_description=`."
            )

        source_is_name = isinstance(source, str)
        raw_old = source if source_is_name else source.get_schema().get("title")
        if not source_is_name and not raw_old:
            raise ValueError(
                "rename(): source SymbolicDataModel has no schema "
                "title; cannot determine the table to rename."
            )
        old_name = table_identifier(raw_old)

        with self._connect(read_only=True) as con:
            found = con.execute(
                f"SELECT COUNT(*) FROM information_schema.tables "
                f"WHERE table_schema='{MAIN_TABLE}' AND table_name=?",
                [old_name],
            ).fetchone()[0]
        if not found:
            raise ValueError(
                f"rename(): no table named {old_name!r} found in the knowledge base."
            )

        new_name = old_name if table_name is None else table_identifier(table_name)
        if new_name != old_name:
            # Both index kinds are tied to the table name: drop them now,
            # they are recreated under the new name further down.
            with self._connect(read_only=False) as con:
                try:
                    con.execute(f"PRAGMA drop_fts_index('main.{old_name}');")
                except duckdb.Error:
                    pass

                con.execute(f"DROP INDEX IF EXISTS vector_main_{old_name};")
                con.execute(f"ALTER TABLE {old_name} RENAME TO {new_name};")

        schema = self._duckdb_table_to_json_schema(new_name)
        schema["title"] = new_name
        if table_description is not None:
            schema["description"] = table_description
        elif not source_is_name:
            # No new description supplied: keep the one on the source model.
            previous_schema = source.get_schema()
            if "description" in previous_schema:
                schema["description"] = previous_schema["description"]
        renamed_model = SymbolicDataModel(schema=schema, name=new_name)

        self.data_models.pop(old_name, None)
        self.data_models[new_name] = renamed_model

        try:
            self._maybe_create_fulltext_index(renamed_model)
        except Exception as e:
            warnings.warn(
                f"FTS index rebuild failed for '{new_name}'; "
                f"fulltext_search results may be stale. ({e})"
            )
        try:
            self._maybe_create_vector_index(renamed_model)
        except Exception as e:
            warnings.warn(
                f"Vector index rebuild failed for '{new_name}'; "
                f"similarity_search will fall back to scan. ({e})"
            )

        return renamed_model

    async def get(
        self,
        id_or_ids: Union[Any, List[Any]],
        *,
        table_name: str,
        remove_embedding: bool = True,
    ) -> Union[Optional[JsonDataModel], List[Optional[JsonDataModel]]]:
        """Fetch records from one table by primary key.

        All requested keys are looked up with a single
        ``SELECT ... WHERE <id> IN (...)``. If that query raises, a
        warning is emitted and the "not found" result is returned.

        Args:
            id_or_ids: One primary key value, or a list of them.
            table_name: Target table.
            remove_embedding: Drop the embedding column from each
                returned record. Defaults to ``True``.

        Returns:
            Scalar input: the matching ``JsonDataModel`` or ``None``.
            List input: a list aligned with the input order, holding
            ``None`` wherever a key had no match.
        """
        return_single = not isinstance(id_or_ids, list)
        if return_single:
            ids = [id_or_ids]
        else:
            ids = list(id_or_ids)

        if not ids:
            return None if return_single else []

        table = table_identifier(table_name)
        json_schema = self._duckdb_table_to_json_schema(table)
        id_key = self._get_id_key(json_schema)

        # Pre-filled with None so unmatched slots need no extra handling.
        results: List[Optional[JsonDataModel]] = [None] * len(ids)

        with self._connect(read_only=True) as con:
            slots = ", ".join("?" for _ in ids)
            try:
                query = f"SELECT * FROM {table} WHERE {id_key} IN ({slots})"
                cursor = con.execute(query, ids)
            except Exception as e:
                warnings.warn(f"get(): SELECT from '{table}' failed. ({e})")
                return None if return_single else results

            fetched = cursor.arrow().read_all().to_pylist()
            if not fetched:
                return None if return_single else results

            json_columns = _get_json_columns_from_schema(json_schema)
            by_key = {row[id_key]: row for row in fetched}

            for slot, wanted in enumerate(ids):
                row = by_key.get(wanted)
                if row is not None:
                    record = _parse_json_columns(row, json_columns)
                    if remove_embedding and self.vss_key in record:
                        record.pop(self.vss_key)
                    results[slot] = JsonDataModel(
                        json=record,
                        schema=json_schema,
                        name=str(record.get(id_key)),
                    )

        return results[0] if return_single else results

    async def getall(
        self,
        *,
        table_name: str,
        limit: int = 50,
        offset: int = 0,
        remove_embedding: bool = True,
    ) -> List[JsonDataModel]:
        """Return one page of rows from a single table.

        A ``duckdb.Error`` raised while reflecting the schema or while
        running the query is turned into a warning and an empty list, so
        callers can enumerate tables without checking for existence
        first.

        Args:
            table_name: Target table.
            limit: Maximum number of records to return.
            offset: Number of records to skip.
            remove_embedding: Drop the embedding column from results.

        Returns:
            A list of `JsonDataModel` records (possibly empty).
        """
        table = table_identifier(table_name)
        try:
            json_schema = self._duckdb_table_to_json_schema(table)
        except duckdb.Error as e:
            warnings.warn(f"Failed to read table '{table}': {e}")
            return []
        id_key = self._get_id_key(json_schema)

        with self._connect(read_only=True) as con:
            page_sql = f"SELECT * FROM {table} LIMIT ? OFFSET ?"
            try:
                fetched = (
                    con.execute(page_sql, [limit, offset])
                    .arrow()
                    .read_all()
                    .to_pylist()
                )
            except duckdb.Error as e:
                warnings.warn(f"Failed to read table '{table}': {e}")
                return []

            if not fetched:
                return []

            json_columns = _get_json_columns_from_schema(json_schema)

            def _as_model(row):
                # Decode JSON columns, optionally drop the embedding, wrap.
                record = _parse_json_columns(row, json_columns)
                if remove_embedding and self.vss_key in record:
                    record.pop(self.vss_key)
                return JsonDataModel(
                    json=record,
                    schema=json_schema,
                    name=str(record.get(id_key)),
                )

            return [_as_model(row) for row in fetched]

    async def delete(
        self,
        id_or_ids: Union[Any, List[Any]],
        *,
        table_name: str,
    ) -> int:
        """Remove records from one table by primary key.

        The deletion uses ``DELETE ... RETURNING`` so the count reflects
        rows that were really removed. Afterwards the FTS and HNSW
        indexes are rebuilt so later searches don't return ghost rows;
        a failed rebuild only emits a warning.

        Args:
            id_or_ids: One primary key value, or a list of them.
            table_name: Target table.

        Returns:
            The number of rows deleted. ``0`` when the id list is
            empty, when nothing matched, or when reflecting the table
            raised ``duckdb.Error`` (treated as "no such table").

        Raises:
            RuntimeError: If the ``DELETE`` statement itself fails.
        """
        if isinstance(id_or_ids, list):
            ids = list(id_or_ids)
        else:
            ids = [id_or_ids]
        if not ids:
            return 0

        table = table_identifier(table_name)
        try:
            json_schema = self._duckdb_table_to_json_schema(table)
        except duckdb.Error:
            warnings.warn(f"delete(): no table named '{table}'; nothing to delete.")
            return 0
        id_key = self._get_id_key(json_schema)

        slots = ", ".join("?" for _ in ids)
        statement = (
            f"DELETE FROM {table} WHERE {id_key} IN ({slots}) RETURNING {id_key};"
        )

        with self._connect(read_only=False) as con:
            try:
                removed = con.execute(statement, ids).fetchall()
            except Exception as e:
                raise RuntimeError(f"delete from '{table}' failed: {e}") from e

        # Prefer the registered model; otherwise wrap the reflected schema.
        model = self.data_models.get(table)
        if model is None:
            model = SymbolicDataModel(schema=json_schema, name=table)

        try:
            self._maybe_create_fulltext_index(model)
        except Exception as e:
            warnings.warn(
                f"FTS index rebuild failed for '{table}' after delete; "
                f"fulltext_search results may be stale. ({e})"
            )
        try:
            self._maybe_create_vector_index(model)
        except Exception as e:
            warnings.warn(
                f"Vector index rebuild failed for '{table}' after "
                f"delete; similarity_search will fall back to scan. ({e})"
            )

        return len(removed)

    async def drop_table(self, table_name: str) -> bool:
        """Remove a table together with its FTS and HNSW indexes.

        The table is also removed from ``self.data_models`` so later
        operations no longer see it.

        Args:
            table_name: Target table.

        Returns:
            ``True`` if the table was found and dropped, ``False`` if
            the catalog lookup found no such table.
        """
        table = table_identifier(table_name)

        lookup_sql = (
            "SELECT COUNT(*) FROM information_schema.tables "
            f"WHERE table_schema='{MAIN_TABLE}' AND table_name=?"
        )
        with self._connect(read_only=True) as con:
            found = con.execute(lookup_sql, [table]).fetchone()[0]
        if not found:
            return False

        with self._connect(read_only=False) as con:
            # The FTS schema is not removed by DROP TABLE, so drop it
            # explicitly; a DuckDB error here is deliberately ignored.
            try:
                con.execute(f"PRAGMA drop_fts_index('main.{table}');")
            except duckdb.Error:
                pass

            for statement in (
                f"DROP INDEX IF EXISTS vector_main_{table};",
                f"DROP TABLE IF EXISTS {table};",
            ):
                con.execute(statement)

        self.data_models.pop(table, None)
        return True

    async def sql(
        self,
        sql: str,
        *,
        params: Optional[List[Any]] = None,
        read_only: bool = True,
        output_format: str = "json",
        **kwargs,
    ):
        """Run a raw SQL string and return the formatted result.

        Args:
            sql: The SQL string.
            params: Optional positional parameters for the query.
            read_only: When ``True`` (default), the statement list is
                parsed up front and anything that is not a SELECT is
                rejected — this blocks multi-statement injection and
                ``COPY ... TO '/path'`` filesystem writes. Set ``False``
                for trusted mutations.
            output_format: ``"json"`` (list of dicts, default) or
                ``"csv"`` (CSV string).
            **kwargs: Accepted but not used by this method.

        Raises:
            duckdb.InvalidInputException: In read-only mode, when the SQL
                cannot be parsed, is empty, or contains a non-SELECT
                statement.
        """
        if read_only:
            # The connection-level read-only flag is not sufficient on its
            # own (`COPY (...) TO '/path'` would pass), so statement types
            # are checked at the parser level.
            try:
                parsed = duckdb.extract_statements(sql)
            except duckdb.Error as e:
                raise duckdb.InvalidInputException(f"Invalid SQL: {e}") from e
            if not parsed:
                raise duckdb.InvalidInputException("Empty SQL query.")
            offending = next(
                (s for s in parsed if s.type != duckdb.StatementType.SELECT),
                None,
            )
            if offending is not None:
                raise duckdb.InvalidInputException(
                    f"read_only=True only permits SELECT statements; "
                    f"got {offending.type.name}."
                )

        bound_params = params or []
        with self._connect(read_only=read_only) as con:
            cursor = con.execute(sql, bound_params)
            arrow_table = cursor.arrow().read_all()
        return format_search_results(arrow_table, output_format)

    @contextmanager
    def _hnsw_ef_search_override(self, con, ef_search: Optional[int]):
        """Apply ``SET hnsw_ef_search = <n>`` for the duration of a block.

        DuckDB exposes ``hnsw_ef_search`` as a session-level override
        of the HNSW ``ef_search`` parameter baked into the index at
        ``CREATE INDEX`` time. ``None`` is a no-op so the default (or
        the index-time value) wins; otherwise we ``SET`` on entry and
        ``RESET`` on exit so we don't leak the override onto whatever
        the next caller wants.
        """
        if ef_search is None:
            yield
            return
        con.execute(f"SET hnsw_ef_search = {int(ef_search)};")
        try:
            yield
        finally:
            con.execute("RESET hnsw_ef_search;")

    async def similarity_search(
        self,
        text_or_texts: Union[str, List[str]],
        *,
        table_name: str,
        k: int = 10,
        threshold: Optional[float] = None,
        ef_search: Optional[int] = None,
        output_format: str = "json",
    ):
        """Vector similarity search against a single table.

        Args:
            text_or_texts: Query text, or list of query texts. Multiple
                queries are merged into a single ranked result set
                (best score per id kept).
            table_name: Target table.
            k: Maximum number of rows returned.
            threshold: Optional maximum vector distance — only rows whose
                score is strictly below it are kept. ``0.0`` is a valid
                threshold; only ``None`` disables the filter.
            ef_search: Optional override for HNSW's search-time
                candidate-list depth. ``None`` keeps the index-time
                value (or DuckDB's default of 64); higher = better
                recall at slower query time. Mirrors Ladybug's
                ``ef_search``.
            output_format: ``"json"`` (list of dicts, default — Python
                data) or ``"csv"`` (CSV string, more compact for LM
                input).

        Returns:
            The rows (each with an added ``score`` column, ascending)
            rendered by ``format_search_results``. An empty result is
            returned when the query is empty or the embedding model
            yields no vectors.

        Raises:
            ValueError: If the adapter has no embedding model.
            RuntimeError: If the DuckDB query fails.
        """
        if not text_or_texts:
            return format_search_results([], output_format)
        if not self.embedding_model:
            raise ValueError(
                "similarity_search requires an embedding model on the adapter."
            )

        texts = [text_or_texts] if not isinstance(text_or_texts, list) else text_or_texts

        label = table_identifier(table_name)
        schema = self._duckdb_table_to_json_schema(label)
        id_key = self._get_id_key(schema)

        embeddings = await self.embedding_model(EmbeddingRequest(texts=texts))
        vectors = embeddings.get("embeddings")
        if not vectors:
            # No embeddings came back (missing key or empty list): there is
            # nothing to search with.
            return format_search_results([], output_format)
        await self._ensure_vector_dim(vectors[0])

        dist_fn = _VSS_DISTANCE_FN[self.metric]
        # Canonical score, shared with LanceDBAdapter. ``array_distance`` is the
        # (non-squared) Euclidean distance, so square it for "l2sq" to match
        # LanceDB's squared-L2; "cosine"/"ip" use the raw function. ``threshold``
        # is compared in the same units as the score (one ``?`` per occurrence).
        _d = f"{dist_fn}({self.vss_key}, ?::FLOAT[{self.vector_dim}])"
        score_expr = f"power({_d}, 2)" if self.metric == "l2sq" else _d

        # The WHERE clause and the bound parameters must be gated by the SAME
        # condition, otherwise threshold=0.0 yields a placeholder/parameter
        # count mismatch.
        has_threshold = threshold is not None
        where_clause = f"WHERE {score_expr} < ?" if has_threshold else ""
        # The statement is identical for every query vector; build it once.
        sql = f"""
            SELECT *,
                {score_expr} AS score
            FROM {label}
            {where_clause}
            ORDER BY score ASC
            LIMIT ?;
        """

        def _bind(vector):
            # Placeholder order: score expr (SELECT), then optionally the
            # score expr + threshold (WHERE), then LIMIT.
            params = [vector]
            if has_threshold:
                params.extend([vector, threshold])
            params.append(k)
            return params

        if len(vectors) == 1:
            with (
                self._connect(read_only=True) as con,
                self._hnsw_ef_search_override(con, ef_search),
            ):
                try:
                    arrow_table = con.execute(sql, _bind(vectors[0])).arrow().read_all()
                except Exception as e:
                    raise RuntimeError(
                        f"Vector search failed for table '{label}': {e}"
                    ) from e
            return format_search_results(arrow_table, output_format)

        # Multi-query: dedupe by id, keep best score, take top-k.
        results: Dict[Any, Dict[str, Any]] = {}
        with (
            self._connect(read_only=True) as con,
            self._hnsw_ef_search_override(con, ef_search),
        ):
            for vector in vectors:
                try:
                    rows = con.execute(sql, _bind(vector)).arrow().read_all().to_pylist()
                except Exception as e:
                    raise RuntimeError(
                        f"Vector search failed for table '{label}': {e}"
                    ) from e

                for row in rows:
                    uid = row[id_key]
                    prev = results.get(uid)
                    # Lower distance is better.
                    if prev is None or row["score"] < prev["score"]:
                        results[uid] = row

        ranked = sorted(results.values(), key=lambda r: r["score"])[:k]
        return format_search_results(ranked, output_format)

    def _bm25_call(
        self,
        id_key: str,
        *,
        bm25_b: Optional[float] = None,
        bm25_k: Optional[float] = None,
        conjunctive: Optional[bool] = None,
    ) -> str:
        """Render a ``match_bm25(id, ?[, named-args])`` call fragment.

        The ``?`` placeholder is reserved for the query string (bound
        as a parameter by the caller). BM25 tuning knobs (``b``,
        ``k``, ``conjunctive``) are inlined via DuckDB's ``key := value``
        named-argument syntax so we don't have to specify the
        positional args we want to leave at the engine default. Only
        non-``None`` overrides are emitted.
        """
        extras: List[str] = []
        for key, value in (
            ("k", bm25_k),
            ("b", bm25_b),
            ("conjunctive", conjunctive),
        ):
            if value is None:
                continue
            extras.append(f"{key} := {self._render_sql_literal(value)}")
        suffix = (", " + ", ".join(extras)) if extras else ""
        return f"match_bm25({id_key}, ?{suffix})"

    async def fulltext_search(
        self,
        text_or_texts: Union[str, List[str]],
        *,
        table_name: str,
        k: int = 10,
        threshold: Optional[float] = None,
        conjunctive: bool = False,
        bm25_b: Optional[float] = None,
        bm25_k: Optional[float] = None,
        output_format: str = "json",
    ):
        """BM25 full-text search against a single table.

        Args:
            text_or_texts: Query text, or list of query texts. Multiple
                queries are merged (best BM25 per id kept).
            table_name: Target table.
            k: Maximum number of rows returned.
            threshold: Optional minimum relevance on the normalized
                ``[0, 1]`` scale. The returned result set is min-max
                scaled (best hit ``1.0``, worst ``0.0``) so ``score`` is
                comparable with the LanceDB adapter despite different raw
                BM25 ranges.
            conjunctive: AND-mode query (every term must match).
                Default ``False`` keeps OR semantics (DuckDB's
                ``conjunctive=0``). Mirrors Ladybug's ``conjunctive``.
            bm25_b: Optional override for BM25's ``b`` parameter
                (document-length normalization). ``None`` defers to
                DuckDB's default (0.75).
            bm25_k: Optional override for BM25's ``k1`` parameter
                (term-frequency saturation). ``None`` defers to
                DuckDB's default (1.2). Named ``bm25_k`` to avoid
                collision with the result-limit ``k`` above.
            output_format: ``"json"`` (list of dicts, default) or
                ``"csv"`` (CSV string).

        Returns:
            The matching rows (with a normalized ``score``, descending)
            rendered by ``format_search_results``. Empty when the query
            is empty or the table has no indexable text columns.

        Raises:
            RuntimeError: If the DuckDB query fails; the original error
                is attached as ``__cause__``.
        """
        if not text_or_texts:
            return format_search_results([], output_format)

        texts = [text_or_texts] if not isinstance(text_or_texts, list) else text_or_texts

        label = table_identifier(table_name)
        schema = self._duckdb_table_to_json_schema(label)
        id_key = self._get_id_key(schema)
        if not self._has_indexable_text_columns(schema):
            warnings.warn(f"Skipping FTS search for {label}: no text columns to index.")
            return format_search_results([], output_format)
        fts_table = sanitize_identifier(f"fts_main_{label}")
        # Build the BM25 call once; the named-arg fragments (b/k/
        # conjunctive) are inlined as literals, leaving the ``?`` for
        # the query string bound per iteration. ``conjunctive=False``
        # is DuckDB's default, so only ``True`` triggers an override.
        bm25_call = self._bm25_call(
            id_key,
            bm25_b=bm25_b,
            bm25_k=bm25_k,
            conjunctive=conjunctive or None,
        )

        sql = f"""
            SELECT t.*, fts.score
            FROM {label} t
            JOIN (
                SELECT
                    {id_key},
                    {fts_table}.{bm25_call} AS score
                FROM {label}
            ) fts ON t.{id_key} = fts.{id_key}
            WHERE fts.score IS NOT NULL
            ORDER BY fts.score DESC
            LIMIT ?;
        """

        # Fetch the top-k by raw BM25 (dedupe across queries, keep best), then
        # rescale to [0, 1] so scores are comparable with the LanceDB adapter.
        results: Dict[Any, Dict[str, Any]] = {}
        with self._connect(read_only=True) as con:
            for text in texts:
                try:
                    rows = con.execute(sql, [text, k]).arrow().read_all().to_pylist()
                except Exception as e:
                    # Chain the cause explicitly, matching delete()'s style.
                    raise RuntimeError(
                        f"FTS query failed for table '{label}': {e}"
                    ) from e
                for row in rows:
                    uid = row[id_key]
                    prev = results.get(uid)
                    # Higher BM25 is better.
                    if prev is None or row["score"] > prev["score"]:
                        results[uid] = row

        ranked = sorted(results.values(), key=lambda r: r["score"], reverse=True)[:k]
        # ``threshold`` filters on the same normalized [0, 1] scale as the score.
        minmax_normalize_scores(ranked, key="score")
        if threshold is not None:
            ranked = [r for r in ranked if r["score"] >= threshold]
        return format_search_results(ranked, output_format)

    async def regex_search(
        self,
        pattern: str,
        *,
        table_name: str,
        fields: Optional[List[str]] = None,
        case_sensitive: bool = True,
        k: int = 10,
        output_format: str = "json",
    ):
        """Find rows whose string fields match a regular expression.

        Uses DuckDB's ``regexp_matches`` under the hood. DuckDB ships
        RE2 (Google's regex library) so evaluation is linear-time — no
        catastrophic-backtracking exposure even if ``pattern`` comes
        from an untrusted source.

        Args:
            pattern: The regex pattern (RE2 syntax).
            table_name: Target table.
            fields: Field names to match against. Defaults to every
                string-typed field on the schema. Names not present
                as string columns are silently dropped.
            case_sensitive: When ``False``, matches case-insensitively.
            k: Maximum number of rows returned.
            output_format: ``"json"`` (list of dicts, default) or
                ``"csv"`` (CSV string).

        Returns:
            Rows where at least one selected column matches, rendered by
            ``format_search_results``. Empty when ``pattern`` is empty or
            no string column is selected (a warning is emitted in the
            latter case).

        Raises:
            RuntimeError: If the DuckDB query fails; the original error
                is attached as ``__cause__``.
        """
        if not pattern:
            return format_search_results([], output_format)

        label = table_identifier(table_name)
        schema = self._duckdb_table_to_json_schema(label)
        properties = schema.get("properties", {})

        string_cols = [
            column_identifier(name)
            for name, info in properties.items()
            if info.get("type") == "string"
        ]
        if fields is not None:
            # Keep schema order; unknown / non-string names drop out here.
            requested = {column_identifier(f) for f in fields}
            cols = [c for c in string_cols if c in requested]
        else:
            cols = string_cols
        if not cols:
            warnings.warn(
                f"Skipping regex search for {label}: no matching string fields."
            )
            return format_search_results([], output_format)

        # "i" is the case-insensitive option; an empty options string keeps
        # the default case-sensitive match.
        flag = "i" if not case_sensitive else ""
        where = " OR ".join(f"regexp_matches({c}, ?, ?)" for c in cols)
        sql = f"SELECT * FROM {label} WHERE {where} LIMIT ?;"
        # One (pattern, flag) pair per column predicate, then the LIMIT.
        params = [pattern, flag] * len(cols)
        params.append(k)

        with self._connect(read_only=True) as con:
            try:
                arrow_table = con.execute(sql, params).arrow().read_all()
            except Exception as e:
                # Chain the cause explicitly, matching delete()'s style.
                raise RuntimeError(
                    f"Regex query failed for table '{label}': {e}"
                ) from e
        return format_search_results(arrow_table, output_format)

    async def hybrid_search(self, *args, **kwargs):
        """Backward-compatible alias that forwards to `hybrid_fts_search`.

        All positional and keyword arguments are passed through
        unchanged and the result is returned as-is. Retained so call
        sites written before the rename keep working; new code should
        call `hybrid_fts_search` directly, which pairs naturally with
        `hybrid_regex_search`.
        """
        fused = await self.hybrid_fts_search(*args, **kwargs)
        return fused

    async def hybrid_fts_search(
        self,
        text_or_texts: Union[str, List[str]],
        *,
        keywords: Optional[Union[str, List[str]]] = None,
        table_name: str,
        k: int = 10,
        k_rank: int = 60,
        similarity_threshold: Optional[float] = None,
        fulltext_threshold: Optional[float] = None,
        ef_search: Optional[int] = None,
        conjunctive: bool = False,
        bm25_b: Optional[float] = None,
        bm25_k: Optional[float] = None,
        output_format: str = "json",
    ):
        """Fuse vector similarity and BM25 fulltext rankings with RRF.

        For each query, `similarity_search` and `fulltext_search` are
        run against ``table_name`` (each asked for ``k * 5`` candidates)
        and their rankings are combined with Reciprocal Rank Fusion,
        ``sum(1 / (k_rank + rank))``. When the adapter has no embedding
        model the method degrades to a plain `fulltext_search`.

        ``text_or_texts`` drives the vector branch. ``keywords``, when
        given, drives the BM25 branch instead, since a natural-language
        query is rarely the best keyword set; when omitted the text is
        reused for both branches.

        Args:
            text_or_texts: Query text or list of query texts for the
                vector branch.
            table_name: Target table.
            keywords: Query text or list of query texts for the BM25
                branch, aligned by position with ``text_or_texts``.
            k: Maximum number of rows returned.
            k_rank: RRF smoothing constant (default 60). Lower values
                weight top-ranked rows more strongly.
            similarity_threshold: Optional maximum vector distance.
            fulltext_threshold: Optional minimum fulltext relevance on
                the normalized ``[0, 1]`` scale.
            ef_search: Forwarded to `similarity_search`.
            conjunctive: Forwarded to `fulltext_search` (AND-mode).
            bm25_b: Forwarded to `fulltext_search`.
            bm25_k: Forwarded to `fulltext_search`.
            output_format: ``"json"`` (list of dicts, default) or
                ``"csv"`` (CSV string).

        Raises:
            ValueError: If ``keywords`` and ``text_or_texts`` differ in
                length.
        """
        if not text_or_texts:
            return format_search_results([], output_format)

        # Options shared by every fulltext_search call below.
        bm25_options = dict(
            threshold=fulltext_threshold,
            conjunctive=conjunctive,
            bm25_b=bm25_b,
            bm25_k=bm25_k,
            output_format="json",
        )

        if not self.embedding_model:
            # Fulltext-only path: explicit keywords win over the text.
            # ``rrf_score`` and ``fulltext_score`` are filled from the BM25
            # score unless the row already carries them.
            lexical_query = text_or_texts if keywords is None else keywords
            fts_rows = await self.fulltext_search(
                lexical_query,
                table_name=table_name,
                k=k,
                **bm25_options,
            )
            for row in fts_rows:
                bm25_score = row.get("score", 0.0)
                row.setdefault("rrf_score", bm25_score)
                row.setdefault("fulltext_score", bm25_score)
            return format_search_results(fts_rows, output_format)

        if isinstance(text_or_texts, str):
            queries = [text_or_texts]
        else:
            queries = list(text_or_texts)

        if keywords is None:
            keyword_queries = list(queries)
        elif isinstance(keywords, str):
            keyword_queries = [keywords]
        else:
            keyword_queries = list(keywords)
        # Lengths are trivially equal when keywords were defaulted.
        if len(keyword_queries) != len(queries):
            raise ValueError(
                f"`keywords` must align with `text_or_texts`: got "
                f"{len(keyword_queries)} keyword(s) vs "
                f"{len(queries)} text(s)."
            )

        label = table_identifier(table_name)
        schema = self._duckdb_table_to_json_schema(label)
        id_key = self._get_id_key(schema)

        def by_score(row):
            return row["score"]

        candidate_k = k * 5
        final_results: Dict[Any, Dict[str, Any]] = {}

        for query_text, keyword_text in zip(queries, keyword_queries):
            try:
                # Each branch is best-effort: a failure yields no candidates
                # from that branch rather than aborting the query.
                try:
                    fts_results = await self.fulltext_search(
                        keyword_text,
                        table_name=label,
                        k=candidate_k,
                        **bm25_options,
                    )
                except Exception:
                    fts_results = []
                try:
                    vss_results = await self.similarity_search(
                        query_text,
                        table_name=label,
                        k=candidate_k,
                        threshold=similarity_threshold,
                        ef_search=ef_search,
                        output_format="json",
                    )
                except Exception:
                    vss_results = []

                if not (fts_results or vss_results):
                    warnings.warn(f"No results for query='{query_text}'.")
                    continue

                # 1-based rank of every id within each branch.
                fts_rank = {
                    r[id_key]: pos for pos, r in enumerate(fts_results, start=1)
                }
                vss_rank = {
                    r[id_key]: pos for pos, r in enumerate(vss_results, start=1)
                }

                # Merge row payloads per id; later rows overwrite earlier keys.
                combined_rows: Dict[Any, Dict[str, Any]] = {}
                for row in fts_results + vss_results:
                    combined_rows.setdefault(row[id_key], {}).update(row)

                # RRF formula: sum(1 / (k_rank + rank))
                for uid, merged_row in combined_rows.items():
                    fused = 0.0
                    for ranking in (fts_rank, vss_rank):
                        position = ranking.get(uid)
                        if position is not None:
                            fused += 1.0 / (k_rank + position)
                    merged_row["score"] = fused

                # Keep, per id, the best fused score seen across queries.
                for r in heapq.nlargest(k, combined_rows.values(), key=by_score):
                    uid = r[id_key]
                    best = final_results.get(uid)
                    if best is None or r["score"] > best["score"]:
                        final_results[uid] = r
            except Exception as e:
                warnings.warn(f"Hybrid search iteration failed: {e}")
                continue

        # Top-k by score, then a deterministic order: score desc, id asc.
        results_sorted = heapq.nlargest(k, final_results.values(), key=by_score)
        results_sorted.sort(key=lambda r: (-r["score"], r.get(id_key)))
        return format_search_results(results_sorted, output_format)

    async def hybrid_regex_search(
        self,
        text_or_texts: Union[str, List[str]],
        *,
        pattern_or_patterns: Union[str, List[str], None] = None,
        table_name: str,
        k: int = 10,
        k_rank: int = 60,
        similarity_threshold: Optional[float] = None,
        ef_search: Optional[int] = None,
        fields: Optional[List[str]] = None,
        case_sensitive: bool = True,
        output_format: str = "json",
    ):
        """Reciprocal-Rank-Fusion of vector similarity + regex match.

        Sibling of `hybrid_fts_search`. The vector half embeds
        ``text_or_texts``; the regex half matches
        ``pattern_or_patterns`` against the table's string columns.
        Degenerates to plain `similarity_search` when no
        patterns are supplied, or to plain `regex_search` when
        no embedding model is configured.

        The regex half does not depend on the query text, so it is
        queried once and reused for every text instead of being re-run
        per text. If any pattern query fails, the partial result is used
        for that text only and the regex half is retried for the next
        one.

        Args:
            text_or_texts: Query text (or list) for the vector half.
            pattern_or_patterns: RE2 pattern (or list) for the regex
                half. ``None`` skips the regex half.
            table_name: Target table.
            k: Maximum number of rows returned.
            k_rank: RRF smoothing constant.
            similarity_threshold: Optional maximum vector distance.
            ef_search: Forwarded to the vector branch
                (`similarity_search`); overrides HNSW's
                search-time candidate-list depth.
            fields: Forwarded to `regex_search`.
            case_sensitive: Forwarded to `regex_search`.
            output_format: ``"json"`` (list of dicts, default) or
                ``"csv"`` (CSV string).

        Returns:
            Up to ``k`` rows with a fused ``score`` (descending, ties
            broken by id), rendered by ``format_search_results``.
        """
        if not text_or_texts and not pattern_or_patterns:
            return format_search_results([], output_format)

        if not self.embedding_model:
            # Regex-only fallback: union of per-pattern hits, deduplicated
            # on the serialized row, truncated to k.
            if not pattern_or_patterns:
                return format_search_results([], output_format)
            patterns_list = (
                [pattern_or_patterns]
                if isinstance(pattern_or_patterns, str)
                else list(pattern_or_patterns)
            )
            merged: Dict[bytes, Dict[str, Any]] = {}
            for p in patterns_list:
                rows = await self.regex_search(
                    p,
                    table_name=table_name,
                    fields=fields,
                    case_sensitive=case_sensitive,
                    k=k,
                    output_format="json",
                )
                for r in rows:
                    sig = orjson.dumps(r, default=str)
                    merged[sig] = r
            return format_search_results(list(merged.values())[:k], output_format)

        texts = [text_or_texts] if isinstance(text_or_texts, str) else list(text_or_texts)
        if pattern_or_patterns is None:
            patterns: List[str] = []
        elif isinstance(pattern_or_patterns, str):
            patterns = [pattern_or_patterns]
        else:
            patterns = list(pattern_or_patterns)

        label = table_identifier(table_name)
        schema = self._duckdb_table_to_json_schema(label)
        id_key = self._get_id_key(schema)

        final_results: Dict[Any, Dict[str, Any]] = {}
        # Regex hits shared across texts; stays None until one complete
        # (error-free) pass over all patterns has been made.
        rx_cache: Optional[List[Dict[str, Any]]] = None

        for query_text in texts:
            try:
                # Vector half is best-effort: a failure yields no candidates.
                try:
                    vss_results = await self.similarity_search(
                        query_text,
                        table_name=label,
                        k=k * 5,
                        threshold=similarity_threshold,
                        ef_search=ef_search,
                        output_format="json",
                    )
                except Exception:
                    vss_results = []

                if rx_cache is not None:
                    rx_results = rx_cache
                else:
                    rx_results = []
                    seen_rx = set()
                    all_patterns_ok = True
                    for pattern in patterns:
                        try:
                            rows = await self.regex_search(
                                pattern,
                                table_name=label,
                                fields=fields,
                                case_sensitive=case_sensitive,
                                k=k * 5,
                                output_format="json",
                            )
                        except Exception:
                            rows = []
                            all_patterns_ok = False
                        for row in rows:
                            # Dedupe rows matched by several patterns.
                            sig = orjson.dumps(row, default=str)
                            if sig not in seen_rx:
                                seen_rx.add(sig)
                                rx_results.append(row)
                    if all_patterns_ok:
                        # rx_results is only read from here on, so sharing
                        # the list across iterations is safe.
                        rx_cache = rx_results

                if not vss_results and not rx_results:
                    warnings.warn(f"No results for query='{query_text}'.")
                    continue

                # 1-based rank of every id within each half.
                vss_rank = {r[id_key]: i + 1 for i, r in enumerate(vss_results)}
                rx_rank = {r[id_key]: i + 1 for i, r in enumerate(rx_results)}

                # Rows are copied before merging so cached regex rows are
                # never mutated.
                combined_rows: Dict[Any, Dict[str, Any]] = {}
                for row in vss_results + rx_results:
                    uid = row[id_key]
                    if uid not in combined_rows:
                        combined_rows[uid] = dict(row)
                    else:
                        combined_rows[uid].update(row)

                # RRF formula: sum(1 / (k_rank + rank))
                for uid in set(vss_rank) | set(rx_rank):
                    score = 0.0
                    if uid in vss_rank:
                        score += 1.0 / (k_rank + vss_rank[uid])
                    if uid in rx_rank:
                        score += 1.0 / (k_rank + rx_rank[uid])
                    combined_rows[uid]["score"] = score

                top_rows = heapq.nlargest(
                    k, combined_rows.values(), key=lambda r: r["score"]
                )
                # Keep, per id, the best fused score seen across texts.
                for r in top_rows:
                    uid = r[id_key]
                    if (
                        uid not in final_results
                        or r["score"] > final_results[uid]["score"]
                    ):
                        final_results[uid] = r
            except Exception as e:
                warnings.warn(f"Hybrid-regex iteration failed: {e}")
                continue

        # Top-k by score, then a deterministic order: score desc, id asc.
        results_sorted = sorted(
            heapq.nlargest(k, final_results.values(), key=lambda r: r["score"]),
            key=lambda r: (-r["score"], r.get(id_key)),
        )
        return format_search_results(results_sorted, output_format)

close()

Close the adapter's cached connection. Idempotent.

Source code in synalinks/src/knowledge_bases/database_adapters/duckdb_adapter.py
def close(self):
    """Close the cached connection, if any, and forget it.

    Safe to call repeatedly: with no cached connection this is a no-op.
    A ``duckdb.Error`` raised while closing is ignored.
    """
    cached = self._con
    if cached is None:
        return
    try:
        cached.close()
    except duckdb.Error:
        pass
    self._con = None

delete(id_or_ids, *, table_name) async

Delete records by primary key from a single table.

Rebuilds the FTS and HNSW indexes after the delete so subsequent search calls don't return ghost rows.

Parameters:

Name Type Description Default
id_or_ids Union[Any, List[Any]]

A single primary key value, or a list of values.

required
table_name str

Target table.

required

Returns:

Type Description
int

The number of rows actually deleted (0 if none matched or if the table doesn't exist).

Source code in synalinks/src/knowledge_bases/database_adapters/duckdb_adapter.py
async def delete(
    self,
    id_or_ids: Union[Any, List[Any]],
    *,
    table_name: str,
) -> int:
    """Remove rows from one table, selected by primary key.

    Once the ``DELETE`` has run, the full-text and vector index
    builders are invoked again for the table so later searches
    don't surface rows that no longer exist. A failure in either
    rebuild is reported as a warning and does not affect the
    returned count.

    Args:
        id_or_ids: One primary key value, or a list of them.
        table_name: Target table.

    Returns:
        How many rows were removed. ``0`` when the id list is empty,
        when nothing matched, or when the table's schema could not
        be read (a warning is emitted in that last case).

    Raises:
        RuntimeError: If the ``DELETE`` statement itself fails.
    """
    if isinstance(id_or_ids, list):
        ids = list(id_or_ids)
    else:
        ids = [id_or_ids]
    if not ids:
        return 0

    table = table_identifier(table_name)
    try:
        json_schema = self._duckdb_table_to_json_schema(table)
    except duckdb.Error:
        warnings.warn(f"delete(): no table named '{table}'; nothing to delete.")
        return 0
    id_key = self._get_id_key(json_schema)

    # One bound parameter per id; RETURNING lets us count what was removed.
    markers = ", ".join("?" for _ in ids)
    statement = (
        f"DELETE FROM {table} WHERE {id_key} IN ({markers}) RETURNING {id_key};"
    )

    with self._connect(read_only=False) as con:
        try:
            removed = con.execute(statement, ids).fetchall()
        except Exception as e:
            raise RuntimeError(f"delete from '{table}' failed: {e}") from e

    deleted = len(removed)

    # Prefer the registered model; otherwise build one from the
    # schema reflected above.
    model = self.data_models.get(table)
    if model is None:
        model = SymbolicDataModel(schema=json_schema, name=table)

    try:
        self._maybe_create_fulltext_index(model)
    except Exception as e:
        warnings.warn(
            f"FTS index rebuild failed for '{table}' after delete; "
            f"fulltext_search results may be stale. ({e})"
        )

    try:
        self._maybe_create_vector_index(model)
    except Exception as e:
        warnings.warn(
            f"Vector index rebuild failed for '{table}' after "
            f"delete; similarity_search will fall back to scan. ({e})"
        )

    return deleted

drop_table(table_name) async

Drop a table and its associated FTS / HNSW indexes.

Also removes the table from the adapter's known-tables registry so subsequent operations stop seeing it.

Parameters:

Name Type Description Default
table_name str

Target table.

required

Returns:

Type Description
bool

True if a table was dropped, False if no such table existed.

Source code in synalinks/src/knowledge_bases/database_adapters/duckdb_adapter.py
async def drop_table(self, table_name: str) -> bool:
    """Remove a table together with its FTS / HNSW indexes.

    The table is also removed from ``self.data_models`` so the
    adapter no longer tracks it.

    Args:
        table_name: Target table.

    Returns:
        ``True`` when a table was dropped, ``False`` when
        ``information_schema.tables`` has no matching entry.
    """
    table = table_identifier(table_name)

    lookup_sql = (
        "SELECT COUNT(*) FROM information_schema.tables "
        f"WHERE table_schema='{MAIN_TABLE}' AND table_name=?"
    )
    with self._connect(read_only=True) as con:
        count_row = con.execute(lookup_sql, [table]).fetchone()
    if not count_row[0]:
        return False

    with self._connect(read_only=False) as con:
        # The FTS schema is not removed by DROP TABLE, so drop it
        # explicitly. A DuckDB error here is ignored.
        try:
            con.execute(f"PRAGMA drop_fts_index('main.{table}');")
        except duckdb.Error:
            pass

        con.execute(f"DROP INDEX IF EXISTS vector_main_{table};")
        con.execute(f"DROP TABLE IF EXISTS {table};")

    self.data_models.pop(table, None)
    return True

from_csv(path, *, table_name=None, table_description=None, delimiter=',', encoding='utf-8', header=True) async

Bulk-load a CSV file directly into a new (or existing) table.

Uses DuckDB's native read_csv so column types are auto-detected from the file (with the conservative bias that zero-padded ids like "00123" stay VARCHAR). The first column is promoted to PRIMARY KEY. Column names are normalized to snake_case; the table name to PascalCase.

Use this over update(CSVDataset(...)) for non-trivial files — the bulk path is orders of magnitude faster because it bypasses the per-row Pydantic / Python pipeline. Prefer update when source rows need transformation before storage.

Parameters:

Name Type Description Default
path str

Path to the CSV file.

required
table_name Optional[str]

Target table name. Defaults to the file's stem (/data/my-docs.csv → MyDocs). Always normalized to PascalCase.

None
table_description Optional[str]

Optional natural-language description attached to the resulting schema's top-level description field.

None
delimiter str

Field delimiter. Defaults to ",".

','
encoding str

File encoding. Defaults to "utf-8".

'utf-8'
header bool

Whether the first row is a header. Defaults to True.

True

Returns:

Type Description
SymbolicDataModel

The SymbolicDataModel for the loaded table. Pass it to fulltext_search / similarity_search / get to query the data.

Source code in synalinks/src/knowledge_bases/database_adapters/duckdb_adapter.py
async def from_csv(
    self,
    path: str,
    *,
    table_name: Optional[str] = None,
    table_description: Optional[str] = None,
    delimiter: str = ",",
    encoding: str = "utf-8",
    header: bool = True,
) -> SymbolicDataModel:
    """Bulk-load a CSV file straight into a new (or existing) table.

    Delegates to ``_bulk_load`` with DuckDB's ``read_csv`` as the
    reader, so column types are auto-detected from the file (with
    the conservative bias that zero-padded ids like ``"00123"``
    stay ``VARCHAR``). The first column is promoted to
    ``PRIMARY KEY``. Column names are normalized to ``snake_case``
    and the table name to ``PascalCase``.

    Choose this over ``update(CSVDataset(...))`` for non-trivial
    files: the bulk path skips the per-row Pydantic / Python
    pipeline and is orders of magnitude faster. Stick with
    ``update`` when rows must be transformed before storage.

    Args:
        path: Path to the CSV file.
        table_name: Target table name. Defaults to the file's stem
            (``/data/my-docs.csv`` → ``MyDocs``). Always normalized
            to PascalCase.
        table_description: Optional natural-language description
            attached to the resulting schema's top-level
            ``description`` field.
        delimiter: Field delimiter. Defaults to ``","``.
        encoding: File encoding. Defaults to ``"utf-8"``.
        header: Whether the first row is a header. Defaults to
            ``True``.

    Returns:
        The `SymbolicDataModel` for the loaded table. Pass
        it to ``fulltext_search`` / ``similarity_search`` / ``get``
        to query the data.
    """

    def _sql_string(value: str) -> str:
        # Render as a single-quoted SQL literal, doubling embedded quotes.
        return "'" + value.replace("'", "''") + "'"

    options = {
        "delim": _sql_string(delimiter),
        "header": "true" if header else "false",
        "encoding": _sql_string(encoding),
    }
    return await self._bulk_load(
        path,
        table_name=table_name,
        table_description=table_description,
        reader_fn="read_csv",
        reader_kwargs=options,
    )

from_json(path, *, table_name=None, table_description=None) async

Bulk-load a JSON file (top-level array of objects).

Same fast-path trade-offs as from_csv / from_parquet. Use from_jsonl for one-object-per-line NDJSON sources.

Parameters:

Name Type Description Default
path str

Path to the JSON file. Must contain a top-level array of objects, e.g. [{"id": "a", "text": "..."}, ...].

required
table_name Optional[str]

Target table name. Defaults to the file's stem, PascalCase-normalized.

None
table_description Optional[str]

Optional schema description.

None

Returns:

Type Description
SymbolicDataModel

The SymbolicDataModel for the loaded table.

Source code in synalinks/src/knowledge_bases/database_adapters/duckdb_adapter.py
async def from_json(
    self,
    path: str,
    *,
    table_name: Optional[str] = None,
    table_description: Optional[str] = None,
) -> SymbolicDataModel:
    """Bulk-load a JSON file whose top level is an array of objects.

    Shares the fast-path trade-offs of `from_csv` /
    `from_parquet`. For one-object-per-line NDJSON sources use
    `from_jsonl` instead.

    Args:
        path: Path to the JSON file. Must contain a top-level array
            of objects, e.g. ``[{"id": "a", "text": "..."}, ...]``.
        table_name: Target table name. Defaults to the file's stem,
            PascalCase-normalized.
        table_description: Optional schema description.

    Returns:
        The `SymbolicDataModel` for the loaded table.
    """
    # ``format`` is passed as an already-quoted SQL literal.
    load_options = dict(
        table_name=table_name,
        table_description=table_description,
        reader_fn="read_json",
        reader_kwargs={"format": "'array'"},
    )
    return await self._bulk_load(path, **load_options)

from_jsonl(path, *, table_name=None, table_description=None) async

Bulk-load a JSON Lines (NDJSON) file.

Same fast-path trade-offs as from_csv / from_parquet. Use this for very large JSON sources that aren't a single array.

Parameters:

Name Type Description Default
path str

Path to the JSONL file.

required
table_name Optional[str]

Target table name. Defaults to the file's stem, PascalCase-normalized.

None
table_description Optional[str]

Optional schema description.

None

Returns:

Type Description
SymbolicDataModel

The SymbolicDataModel for the loaded table.

Source code in synalinks/src/knowledge_bases/database_adapters/duckdb_adapter.py
async def from_jsonl(
    self,
    path: str,
    *,
    table_name: Optional[str] = None,
    table_description: Optional[str] = None,
) -> SymbolicDataModel:
    """Bulk-load a JSON Lines (NDJSON) file.

    Shares the fast-path trade-offs of `from_csv` /
    `from_parquet`. Intended for very large JSON sources that are
    not wrapped in a single array.

    Args:
        path: Path to the JSONL file.
        table_name: Target table name. Defaults to the file's stem,
            PascalCase-normalized.
        table_description: Optional schema description.

    Returns:
        The `SymbolicDataModel` for the loaded table.
    """
    # ``format`` is passed as an already-quoted SQL literal.
    load_options = dict(
        table_name=table_name,
        table_description=table_description,
        reader_fn="read_json",
        reader_kwargs={"format": "'newline_delimited'"},
    )
    return await self._bulk_load(path, **load_options)

from_parquet(path, *, table_name=None, table_description=None) async

Bulk-load a Parquet file directly into a new (or existing) table.

Same fast-path trade-offs as from_csv — bypasses the Python row pipeline for native DuckDB ingestion. Parquet's schema is explicit in the file's footer, so types are preserved end-to-end without auto-detection guesswork.

Parameters:

Name Type Description Default
path str

Path to the Parquet file.

required
table_name Optional[str]

Target table name. Defaults to the file's stem, PascalCase-normalized.

None
table_description Optional[str]

Optional schema description.

None

Returns:

Type Description
SymbolicDataModel

The SymbolicDataModel for the loaded table.

Source code in synalinks/src/knowledge_bases/database_adapters/duckdb_adapter.py
async def from_parquet(
    self,
    path: str,
    *,
    table_name: Optional[str] = None,
    table_description: Optional[str] = None,
) -> SymbolicDataModel:
    """Bulk-load a Parquet file straight into a new (or existing) table.

    Shares the fast-path trade-offs of `from_csv`: the Python row
    pipeline is bypassed in favour of native DuckDB ingestion.
    Parquet's schema is explicit in the file's footer, so types
    are preserved end-to-end without auto-detection guesswork.

    Args:
        path: Path to the Parquet file.
        table_name: Target table name. Defaults to the file's stem,
            PascalCase-normalized.
        table_description: Optional schema description.

    Returns:
        The `SymbolicDataModel` for the loaded table.
    """
    # No reader options are passed for Parquet.
    load_options = dict(
        table_name=table_name,
        table_description=table_description,
        reader_fn="read_parquet",
        reader_kwargs={},
    )
    return await self._bulk_load(path, **load_options)

BM25 full-text search against a single table.

Parameters:

Name Type Description Default
text_or_texts Union[str, List[str]]

Query text, or list of query texts. Multiple queries are merged (best BM25 per id kept).

required
table_name str

Target table.

required
k int

Maximum number of rows returned.

10
threshold Optional[float]

Optional minimum relevance on the normalized [0, 1] scale. The returned result set is min-max scaled (best hit 1.0, worst 0.0) so score is comparable with the LanceDB adapter despite different raw BM25 ranges.

None
conjunctive bool

AND-mode query (every term must match). Default False keeps OR semantics (DuckDB's conjunctive=0). Mirrors Ladybug's conjunctive.

False
bm25_b Optional[float]

Optional override for BM25's b parameter (document-length normalization). None defers to DuckDB's default (0.75).

None
bm25_k Optional[float]

Optional override for BM25's k1 parameter (term-frequency saturation). None defers to DuckDB's default (1.2). Named bm25_k to avoid collision with the result-limit k above.

None
output_format str

"json" (list of dicts, default) or "csv" (CSV string).

'json'
Source code in synalinks/src/knowledge_bases/database_adapters/duckdb_adapter.py
async def fulltext_search(
    self,
    text_or_texts: Union[str, List[str]],
    *,
    table_name: str,
    k: int = 10,
    threshold: Optional[float] = None,
    conjunctive: bool = False,
    bm25_b: Optional[float] = None,
    bm25_k: Optional[float] = None,
    output_format: str = "json",
):
    """BM25 full-text search against a single table.

    Args:
        text_or_texts: Query text, or list of query texts. Multiple
            queries are merged (best BM25 per id kept).
        table_name: Target table.
        k: Maximum number of rows returned.
        threshold: Optional minimum relevance on the normalized
            ``[0, 1]`` scale. The returned result set is min-max
            scaled (best hit ``1.0``, worst ``0.0``) so ``score`` is
            comparable with the LanceDB adapter despite different raw
            BM25 ranges.
        conjunctive: AND-mode query (every term must match).
            Default ``False`` keeps OR semantics (DuckDB's
            ``conjunctive=0``). Mirrors Ladybug's ``conjunctive``.
        bm25_b: Optional override for BM25's ``b`` parameter
            (document-length normalization). ``None`` defers to
            DuckDB's default (0.75).
        bm25_k: Optional override for BM25's ``k1`` parameter
            (term-frequency saturation). ``None`` defers to
            DuckDB's default (1.2). Named ``bm25_k`` to avoid
            collision with the result-limit ``k`` above.
        output_format: ``"json"`` (list of dicts, default) or
            ``"csv"`` (CSV string).

    Returns:
        The ranked rows passed through ``format_search_results``
        for the requested ``output_format``. An empty result is
        returned when the query is empty or the table has no
        indexable text columns (a warning is emitted in the latter
        case).

    Raises:
        RuntimeError: If the FTS query fails for any query text. The
            underlying exception is attached as ``__cause__``.
    """
    if not text_or_texts:
        return format_search_results([], output_format)

    texts = [text_or_texts] if not isinstance(text_or_texts, list) else text_or_texts

    label = table_identifier(table_name)
    schema = self._duckdb_table_to_json_schema(label)
    id_key = self._get_id_key(schema)
    if not self._has_indexable_text_columns(schema):
        warnings.warn(f"Skipping FTS search for {label}: no text columns to index.")
        return format_search_results([], output_format)
    fts_table = sanitize_identifier(f"fts_main_{label}")
    # Build the BM25 call once; the named-arg fragments (b/k/
    # conjunctive) are inlined as literals, leaving the ``?`` for
    # the query string bound per iteration. ``conjunctive=False``
    # is DuckDB's default, so only ``True`` triggers an override.
    bm25_call = self._bm25_call(
        id_key,
        bm25_b=bm25_b,
        bm25_k=bm25_k,
        conjunctive=conjunctive or None,
    )

    sql = f"""
        SELECT t.*, fts.score
        FROM {label} t
        JOIN (
            SELECT
                {id_key},
                {fts_table}.{bm25_call} AS score
            FROM {label}
        ) fts ON t.{id_key} = fts.{id_key}
        WHERE fts.score IS NOT NULL
        ORDER BY fts.score DESC
        LIMIT ?;
    """

    # Fetch the top-k by raw BM25 (dedupe across queries, keep best), then
    # rescale to [0, 1] so scores are comparable with the LanceDB adapter.
    results: Dict[Any, Dict[str, Any]] = {}
    with self._connect(read_only=True) as con:
        for text in texts:
            try:
                rows = con.execute(sql, [text, k]).arrow().read_all().to_pylist()
            except Exception as e:
                # Chain explicitly so the original DuckDB error stays
                # attached as the cause, matching ``delete()``.
                raise RuntimeError(
                    f"FTS query failed for table '{label}': {e}"
                ) from e
            for row in rows:
                uid = row[id_key]
                prev = results.get(uid)
                if prev is None or row["score"] > prev["score"]:
                    results[uid] = row

    ranked = sorted(results.values(), key=lambda r: r["score"], reverse=True)[:k]
    # ``threshold`` filters on the same normalized [0, 1] scale as the score.
    minmax_normalize_scores(ranked, key="score")
    if threshold is not None:
        ranked = [r for r in ranked if r["score"] >= threshold]
    return format_search_results(ranked, output_format)

get(id_or_ids, *, table_name, remove_embedding=True) async

Look up one or more records by primary key in a single table.

Parameters:

Name Type Description Default
id_or_ids Union[Any, List[Any]]

A single primary key value, or a list of values.

required
table_name str

Target table.

required
remove_embedding bool

Strip the embedding column from the returned records. Defaults to True to keep results LM-friendly.

True

Returns:

Type Description
Union[Optional[JsonDataModel], List[Optional[JsonDataModel]]]

For a scalar id: the matching JsonDataModel, or None if not found. For a list of ids: a list in the same order, with None in the slots that didn't match.

Source code in synalinks/src/knowledge_bases/database_adapters/duckdb_adapter.py
async def get(
    self,
    id_or_ids: Union[Any, List[Any]],
    *,
    table_name: str,
    remove_embedding: bool = True,
) -> Union[Optional[JsonDataModel], List[Optional[JsonDataModel]]]:
    """Look up one or more records by primary key in a single table.

    If the table's schema cannot be read (for example because the
    table does not exist) or the ``SELECT`` fails, a warning is
    emitted and the "not found" shape is returned instead of
    raising, in line with ``delete()`` and ``getall()``.

    Args:
        id_or_ids: A single primary key value, or a list of values.
        table_name: Target table.
        remove_embedding: Strip the embedding column from the
            returned records. Defaults to ``True`` to keep results
            LM-friendly.

    Returns:
        For a scalar id: the matching ``JsonDataModel``, or ``None``
        if not found. For a list of ids: a list in the same order,
        with ``None`` in the slots that didn't match.
    """
    return_single = not isinstance(id_or_ids, list)
    ids = [id_or_ids] if return_single else list(id_or_ids)

    if not ids:
        return None if return_single else []

    # Pre-filled with None so unmatched ids keep their slot.
    results: List[Optional[JsonDataModel]] = [None] * len(ids)

    table = table_identifier(table_name)
    try:
        json_schema = self._duckdb_table_to_json_schema(table)
    except duckdb.Error as e:
        # Same graceful degradation as the SELECT failure below.
        warnings.warn(f"get(): failed to read table '{table}'. ({e})")
        return None if return_single else results
    id_key = self._get_id_key(json_schema)

    with self._connect(read_only=True) as con:
        placeholders = ", ".join(["?"] * len(ids))
        try:
            sql = f"SELECT * FROM {table} WHERE {id_key} IN ({placeholders})"
            cursor = con.execute(sql, ids)
        except Exception as e:
            warnings.warn(f"get(): SELECT from '{table}' failed. ({e})")
            return None if return_single else results

        rows = cursor.arrow().read_all().to_pylist()
        if not rows:
            return None if return_single else results

        json_columns = _get_json_columns_from_schema(json_schema)
        # Index by primary key so results can be emitted in the
        # caller's id order rather than the database's row order.
        rows_by_id = {row[id_key]: row for row in rows}

        for idx, id_val in enumerate(ids):
            row = rows_by_id.get(id_val)
            if row is None:
                continue
            json_data = _parse_json_columns(row, json_columns)
            if remove_embedding and self.vss_key in json_data:
                json_data.pop(self.vss_key)
            results[idx] = JsonDataModel(
                json=json_data,
                schema=json_schema,
                name=str(json_data.get(id_key)),
            )

    return results[0] if return_single else results

get_symbolic_data_models(remove_embedding=True)

Reflect every table in the main schema into a symbolic model.

Parameters:

Name Type Description Default
remove_embedding bool

Strip the embedding column from each reflected schema. Defaults to True.

True

Returns:

Type Description
List[SymbolicDataModel]

List[SymbolicDataModel]: One SymbolicDataModel per table, representing the current database schema.

Source code in synalinks/src/knowledge_bases/database_adapters/duckdb_adapter.py
def get_symbolic_data_models(
    self,
    remove_embedding=True,
) -> List[SymbolicDataModel]:
    """Build a symbolic model for each table in the ``main`` schema.

    Table names are read from ``information_schema.tables`` and each
    one is reflected through ``_duckdb_table_to_json_schema``.

    Args:
        remove_embedding (bool): Documented as stripping the
            embedding column from each reflected schema. Defaults
            to ``True``.
            NOTE(review): this implementation never reads the flag,
            so schemas are returned exactly as reflected. Confirm
            whether stripping is still intended.

    Returns:
        List[SymbolicDataModel]: One ``SymbolicDataModel`` per table,
            named after the table it was reflected from.
    """
    with self._connect(read_only=True) as con:
        table_rows = con.execute("""
            SELECT table_name 
            FROM information_schema.tables 
            WHERE table_schema='main';
        """).fetchall()

        # Each fetched row is a 1-tuple holding the table name.
        return [
            SymbolicDataModel(
                schema=self._duckdb_table_to_json_schema(name),
                name=name,
            )
            for (name,) in table_rows
        ]

getall(*, table_name, limit=50, offset=0, remove_embedding=True) async

List rows from a single table, paginated.

Returns an empty list (with a warning) if the table doesn't exist, so callers can safely enumerate without pre-checking.

Parameters:

Name Type Description Default
table_name str

Target table.

required
limit int

Maximum number of records to return.

50
offset int

Number of records to skip.

0
remove_embedding bool

Strip the embedding column from results.

True

Returns:

Type Description
List[JsonDataModel]

A list of JsonDataModel records.

Source code in synalinks/src/knowledge_bases/database_adapters/duckdb_adapter.py
async def getall(
    self,
    *,
    table_name: str,
    limit: int = 50,
    offset: int = 0,
    remove_embedding: bool = True,
) -> List[JsonDataModel]:
    """Return one page of rows from a single table.

    A table whose schema cannot be read, or whose ``SELECT`` fails
    with a DuckDB error, yields an empty list plus a warning, so
    callers can enumerate tables without checking for existence.

    Args:
        table_name: Target table.
        limit: Maximum number of records to return.
        offset: Number of records to skip.
        remove_embedding: Strip the embedding column from results.

    Returns:
        A list of `JsonDataModel` records.
    """
    table = table_identifier(table_name)
    try:
        json_schema = self._duckdb_table_to_json_schema(table)
    except duckdb.Error as e:
        warnings.warn(f"Failed to read table '{table}': {e}")
        return []
    id_key = self._get_id_key(json_schema)

    with self._connect(read_only=True) as con:
        page_sql = f"SELECT * FROM {table} LIMIT ? OFFSET ?"
        try:
            rows = con.execute(page_sql, [limit, offset]).arrow().read_all().to_pylist()
        except duckdb.Error as e:
            warnings.warn(f"Failed to read table '{table}': {e}")
            return []

        if not rows:
            return []

        json_columns = _get_json_columns_from_schema(json_schema)

        records = []
        for raw_row in rows:
            payload = _parse_json_columns(raw_row, json_columns)
            if remove_embedding and self.vss_key in payload:
                del payload[self.vss_key]
            # Each record is named after its primary key value.
            record = JsonDataModel(
                json=payload,
                schema=json_schema,
                name=str(payload.get(id_key)),
            )
            records.append(record)
        return records

Reciprocal-Rank-Fusion of vector similarity + BM25 fulltext.

Internally runs similarity_search and fulltext_search against the same table_name, then fuses their rankings with the RRF formula sum(1 / (k_rank + rank)). Falls back to pure FTS if no embedding model is configured.

text_or_texts feeds the vector branch; keywords (when provided) feeds the BM25 branch instead — the two signals look for different things (semantic vs lexical) and the natural-language query that drives the vectors is usually not the keyword set you'd hand to BM25. When keywords is omitted, the text is reused for both branches so existing call sites keep working.

Parameters:

Name Type Description Default
text_or_texts Union[str, List[str]]

Query text or list of query texts for the vector branch.

required
table_name str

Target table.

required
keywords Optional[Union[str, List[str]]]

Query text or list of query texts for the BM25 branch. Aligns by position with text_or_texts; when omitted, the text is reused.

None
k int

Maximum number of rows returned.

10
k_rank int

RRF smoothing constant (default 60). Lower values weight top-ranked rows more strongly.

60
similarity_threshold Optional[float]

Optional maximum vector distance.

None
fulltext_threshold Optional[float]

Optional minimum fulltext relevance on the normalized [0, 1] scale.

None
ef_search Optional[int]

Forwarded to the vector branch (similarity_search); overrides HNSW's search-time candidate-list depth.

None
conjunctive bool

Forwarded to the BM25 branch (fulltext_search); switches to AND-mode query.

False
bm25_b Optional[float]

Forwarded to the BM25 branch; document-length normalization override.

None
bm25_k Optional[float]

Forwarded to the BM25 branch; term-frequency saturation (k1) override.

None
output_format str

"json" (list of dicts, default) or "csv" (CSV string).

'json'
Source code in synalinks/src/knowledge_bases/database_adapters/duckdb_adapter.py
async def hybrid_fts_search(
    self,
    text_or_texts: Union[str, List[str]],
    *,
    keywords: Optional[Union[str, List[str]]] = None,
    table_name: str,
    k: int = 10,
    k_rank: int = 60,
    similarity_threshold: Optional[float] = None,
    fulltext_threshold: Optional[float] = None,
    ef_search: Optional[int] = None,
    conjunctive: bool = False,
    bm25_b: Optional[float] = None,
    bm25_k: Optional[float] = None,
    output_format: str = "json",
):
    """Reciprocal-Rank-Fusion of vector similarity + BM25 fulltext.

    Internally runs `similarity_search` and
    `fulltext_search` against the same ``table_name``, then
    fuses their rankings with the RRF formula
    ``sum(1 / (k_rank + rank))``. Falls back to pure FTS if no
    embedding model is configured.

    ``text_or_texts`` feeds the vector branch; ``keywords`` (when
    provided) feeds the BM25 branch instead — the two signals
    look for different things (semantic vs lexical) and the
    natural-language query that drives the vectors is usually
    not the keyword set you'd hand to BM25. When ``keywords`` is
    omitted, the text is reused for both branches so existing
    call sites keep working.

    Every returned row carries the fused value under both ``score``
    and ``rrf_score``, so callers can read ``rrf_score`` whether or
    not the fulltext-only fallback was taken.

    Args:
        text_or_texts: Query text or list of query texts for the
            vector branch.
        table_name: Target table.
        keywords: Query text or list of query texts for the BM25
            branch. Aligns by position with ``text_or_texts``;
            when omitted, the text is reused.
        k: Maximum number of rows returned.
        k_rank: RRF smoothing constant (default 60). Lower values
            weight top-ranked rows more strongly.
        similarity_threshold: Optional maximum vector distance.
        fulltext_threshold: Optional minimum fulltext relevance on
            the normalized ``[0, 1]`` scale.
        ef_search: Forwarded to the vector branch
            (`similarity_search`); overrides HNSW's
            search-time candidate-list depth.
        conjunctive: Forwarded to the BM25 branch
            (`fulltext_search`); switches to AND-mode query.
        bm25_b: Forwarded to the BM25 branch; document-length
            normalization override.
        bm25_k: Forwarded to the BM25 branch; term-frequency
            saturation (``k1``) override.
        output_format: ``"json"`` (list of dicts, default) or
            ``"csv"`` (CSV string).

    Returns:
        The fused rows, sorted by descending score with the primary
        key as tie-breaker, passed through
        ``format_search_results`` for the requested
        ``output_format``.

    Raises:
        ValueError: If ``keywords`` is given and its length differs
            from ``text_or_texts`` (checked only when an embedding
            model is configured).
    """
    if not text_or_texts:
        return format_search_results([], output_format)

    if not self.embedding_model:
        # Fulltext-only fallback. Prefer explicit keywords when
        # the caller passed them — that's what the BM25 branch
        # would have used in the full hybrid path anyway. Tag
        # each row with ``rrf_score`` (set to the BM25 score) so
        # the result shape matches the full-hybrid path; callers
        # can always read ``rrf_score`` without branching.
        fts_rows = await self.fulltext_search(
            keywords if keywords is not None else text_or_texts,
            table_name=table_name,
            k=k,
            threshold=fulltext_threshold,
            conjunctive=conjunctive,
            bm25_b=bm25_b,
            bm25_k=bm25_k,
            output_format="json",
        )
        for row in fts_rows:
            row.setdefault("rrf_score", row.get("score", 0.0))
            row.setdefault("fulltext_score", row.get("score", 0.0))
        return format_search_results(fts_rows, output_format)

    queries = (
        [text_or_texts] if isinstance(text_or_texts, str) else list(text_or_texts)
    )
    if keywords is None:
        keyword_queries = list(queries)
    else:
        keyword_queries = [keywords] if isinstance(keywords, str) else list(keywords)
        if len(keyword_queries) != len(queries):
            raise ValueError(
                f"`keywords` must align with `text_or_texts`: got "
                f"{len(keyword_queries)} keyword(s) vs "
                f"{len(queries)} text(s)."
            )

    label = table_identifier(table_name)
    schema = self._duckdb_table_to_json_schema(label)
    id_key = self._get_id_key(schema)

    final_results: Dict[Any, Dict[str, Any]] = {}

    for query_text, keyword_text in zip(queries, keyword_queries):
        try:
            # Each branch is best-effort: a failure degrades to an
            # empty ranking for that branch, but is surfaced as a
            # warning instead of being hidden behind "No results".
            try:
                fts_results = await self.fulltext_search(
                    keyword_text,
                    table_name=label,
                    k=k * 5,
                    threshold=fulltext_threshold,
                    conjunctive=conjunctive,
                    bm25_b=bm25_b,
                    bm25_k=bm25_k,
                    output_format="json",
                )
            except Exception as e:
                warnings.warn(
                    f"Fulltext branch failed for query='{keyword_text}': {e}"
                )
                fts_results = []
            try:
                vss_results = await self.similarity_search(
                    query_text,
                    table_name=label,
                    k=k * 5,
                    threshold=similarity_threshold,
                    ef_search=ef_search,
                    output_format="json",
                )
            except Exception as e:
                warnings.warn(
                    f"Vector branch failed for query='{query_text}': {e}"
                )
                vss_results = []

            if not fts_results and not vss_results:
                warnings.warn(f"No results for query='{query_text}'.")
                continue

            # 1-based rank of each id within its own branch.
            fts_rank = {r[id_key]: i + 1 for i, r in enumerate(fts_results)}
            vss_rank = {r[id_key]: i + 1 for i, r in enumerate(vss_results)}

            combined_rows: Dict[Any, Dict[str, Any]] = {}
            for row in fts_results + vss_results:
                uid = row[id_key]
                if uid not in combined_rows:
                    combined_rows[uid] = dict(row)
                else:
                    combined_rows[uid].update(row)

            # RRF formula: sum(1 / (k_rank + rank))
            for uid in set(fts_rank) | set(vss_rank):
                score = 0.0
                if uid in fts_rank:
                    score += 1.0 / (k_rank + fts_rank[uid])
                if uid in vss_rank:
                    score += 1.0 / (k_rank + vss_rank[uid])
                combined_rows[uid]["score"] = score
                # Mirror the fused value under ``rrf_score`` so this
                # path exposes the same key as the fallback above.
                combined_rows[uid]["rrf_score"] = score

            top_rows = heapq.nlargest(
                k, combined_rows.values(), key=lambda r: r["score"]
            )
            # Across queries, keep each id's best fused score.
            for r in top_rows:
                uid = r[id_key]
                if (
                    uid not in final_results
                    or r["score"] > final_results[uid]["score"]
                ):
                    final_results[uid] = r
        except Exception as e:
            warnings.warn(f"Hybrid search iteration failed: {e}")
            continue

    results_sorted = sorted(
        heapq.nlargest(k, final_results.values(), key=lambda r: r["score"]),
        key=lambda r: (-r["score"], r.get(id_key)),
    )
    return format_search_results(results_sorted, output_format)

Reciprocal-Rank-Fusion of vector similarity + regex match.

Sibling of hybrid_fts_search. The vector half embeds text_or_texts; the regex half matches pattern_or_patterns against the table's string columns. Degenerates to plain similarity_search when no patterns are supplied, or to plain regex_search when no embedding model is configured.

Parameters:

Name Type Description Default
text_or_texts Union[str, List[str]]

Query text (or list) for the vector half.

required
pattern_or_patterns Union[str, List[str], None]

RE2 pattern (or list) for the regex half. None skips the regex half.

None
table_name str

Target table.

required
k int

Maximum number of rows returned.

10
k_rank int

RRF smoothing constant.

60
similarity_threshold Optional[float]

Optional maximum vector distance.

None
ef_search Optional[int]

Forwarded to the vector branch (similarity_search); overrides HNSW's search-time candidate-list depth.

None
fields Optional[List[str]]

Forwarded to regex_search.

None
case_sensitive bool

Forwarded to regex_search.

True
output_format str

"json" (list of dicts, default) or "csv" (CSV string).

'json'
Source code in synalinks/src/knowledge_bases/database_adapters/duckdb_adapter.py
async def hybrid_regex_search(
    self,
    text_or_texts: Union[str, List[str]],
    *,
    pattern_or_patterns: Union[str, List[str], None] = None,
    table_name: str,
    k: int = 10,
    k_rank: int = 60,
    similarity_threshold: Optional[float] = None,
    ef_search: Optional[int] = None,
    fields: Optional[List[str]] = None,
    case_sensitive: bool = True,
    output_format: str = "json",
):
    """Reciprocal-Rank-Fusion of vector similarity + regex match.

    Sibling of `hybrid_fts_search`. The vector half embeds
    ``text_or_texts``; the regex half matches
    ``pattern_or_patterns`` against the table's string columns.
    Degenerates to plain `similarity_search` when no
    patterns are supplied, or to plain `regex_search` when
    no embedding model is configured.

    Each query text is fused separately: a row's score is the sum of
    ``1 / (k_rank + rank)`` over the halves it appears in. Across query
    texts the best score per id is kept. The regex half does not depend
    on the query text, so it is evaluated once and reused.

    Args:
        text_or_texts: Query text (or list) for the vector half. An
            empty value is accepted when patterns are supplied; the
            result is then driven by the regex half alone.
        pattern_or_patterns: RE2 pattern (or list) for the regex
            half. ``None`` skips the regex half.
        table_name: Target table.
        k: Maximum number of rows returned.
        k_rank: RRF smoothing constant.
        similarity_threshold: Optional maximum vector distance.
        ef_search: Forwarded to the vector branch
            (`similarity_search`); overrides HNSW's
            search-time candidate-list depth.
        fields: Forwarded to `regex_search`.
        case_sensitive: Forwarded to `regex_search`.
        output_format: ``"json"`` (list of dicts, default) or
            ``"csv"`` (CSV string).

    Returns:
        The fused rows, passed through `format_search_results` with
        ``output_format``. Rows on the fusion path carry a ``score``
        key holding the RRF score.
    """
    if not text_or_texts and not pattern_or_patterns:
        return format_search_results([], output_format)

    if not self.embedding_model:
        # No vector half available: fall back to a regex-only union.
        if not pattern_or_patterns:
            return format_search_results([], output_format)
        patterns_list = (
            [pattern_or_patterns]
            if isinstance(pattern_or_patterns, str)
            else list(pattern_or_patterns)
        )
        # Keyed by a serialized signature so identical rows matched by
        # several patterns are reported once (first-seen order kept).
        merged: Dict[bytes, Dict[str, Any]] = {}
        for p in patterns_list:
            rows = await self.regex_search(
                p,
                table_name=table_name,
                fields=fields,
                case_sensitive=case_sensitive,
                k=k,
                output_format="json",
            )
            for r in rows:
                sig = orjson.dumps(r, default=str)
                merged[sig] = r
        return format_search_results(list(merged.values())[:k], output_format)

    if isinstance(text_or_texts, str):
        texts = [text_or_texts]
    else:
        texts = list(text_or_texts or [])
    if not texts:
        # Patterns are necessarily present here (see the first guard).
        # Drive one fusion pass with an empty query so the regex half
        # still contributes, exactly as it already does for ``""``.
        texts = [""]

    if pattern_or_patterns is None:
        patterns: List[str] = []
    elif isinstance(pattern_or_patterns, str):
        patterns = [pattern_or_patterns]
    else:
        patterns = list(pattern_or_patterns)

    label = table_identifier(table_name)
    schema = self._duckdb_table_to_json_schema(label)
    id_key = self._get_id_key(schema)

    final_results: Dict[Any, Dict[str, Any]] = {}
    # Regex rows are independent of the query text: computed on the first
    # successful pass and reused afterwards. Left as ``None`` on failure
    # so the next pass retries, matching the per-iteration error handling.
    rx_results: Optional[List[Dict[str, Any]]] = None

    for query_text in texts:
        try:
            try:
                # Over-fetch (k * 5) so fusion has candidates beyond top-k.
                vss_results = await self.similarity_search(
                    query_text,
                    table_name=label,
                    k=k * 5,
                    threshold=similarity_threshold,
                    ef_search=ef_search,
                    output_format="json",
                )
            except Exception:
                # Best-effort: a failing vector half contributes no rows.
                vss_results = []

            if rx_results is None:
                collected: List[Dict[str, Any]] = []
                seen_rx = set()
                for pattern in patterns:
                    try:
                        rows = await self.regex_search(
                            pattern,
                            table_name=label,
                            fields=fields,
                            case_sensitive=case_sensitive,
                            k=k * 5,
                            output_format="json",
                        )
                    except Exception:
                        # Best-effort: a failing pattern contributes no rows.
                        rows = []
                    for row in rows:
                        sig = orjson.dumps(row, default=str)
                        if sig not in seen_rx:
                            seen_rx.add(sig)
                            collected.append(row)
                rx_results = collected

            if not vss_results and not rx_results:
                warnings.warn(f"No results for query='{query_text}'.")
                continue

            # 1-based rank of each id within its own result list.
            vss_rank = {r[id_key]: i + 1 for i, r in enumerate(vss_results)}
            rx_rank = {r[id_key]: i + 1 for i, r in enumerate(rx_results)}

            # Copy rows before annotating them so the cached regex rows
            # are never mutated between passes.
            combined_rows: Dict[Any, Dict[str, Any]] = {}
            for row in vss_results + rx_results:
                uid = row[id_key]
                if uid not in combined_rows:
                    combined_rows[uid] = dict(row)
                else:
                    combined_rows[uid].update(row)

            for uid in set(vss_rank) | set(rx_rank):
                score = 0.0
                if uid in vss_rank:
                    score += 1.0 / (k_rank + vss_rank[uid])
                if uid in rx_rank:
                    score += 1.0 / (k_rank + rx_rank[uid])
                # Overwrites any ``score`` carried over from the vector row.
                combined_rows[uid]["score"] = score

            top_rows = heapq.nlargest(
                k, combined_rows.values(), key=lambda r: r["score"]
            )
            for r in top_rows:
                uid = r[id_key]
                if (
                    uid not in final_results
                    or r["score"] > final_results[uid]["score"]
                ):
                    final_results[uid] = r
        except Exception as e:
            warnings.warn(f"Hybrid-regex iteration failed: {e}")
            continue

    # Highest score first; ties broken by id for a stable order.
    results_sorted = sorted(
        heapq.nlargest(k, final_results.values(), key=lambda r: r["score"]),
        key=lambda r: (-r["score"], r.get(id_key)),
    )
    return format_search_results(results_sorted, output_format)

Deprecated alias of hybrid_fts_search.

Kept so call sites pre-dating the rename keep working. Prefer the new name in new code — it's symmetric with hybrid_regex_search.

Source code in synalinks/src/knowledge_bases/database_adapters/duckdb_adapter.py
async def hybrid_search(self, *args, **kwargs):
    """Backward-compatible (deprecated) name for `hybrid_fts_search`.

    Exists only so callers written before the rename continue to
    work; every positional and keyword argument is forwarded
    unchanged. New code should call `hybrid_fts_search`, whose name
    mirrors `hybrid_regex_search`.
    """
    forwarded = self.hybrid_fts_search(*args, **kwargs)
    outcome = await forwarded
    return outcome

Find rows whose string fields match a regular expression.

Uses DuckDB's regexp_matches under the hood. DuckDB ships RE2 (Google's regex library) so evaluation is linear-time — no catastrophic-backtracking exposure even if pattern comes from an untrusted source.

Parameters:

Name Type Description Default
pattern str

The regex pattern (RE2 syntax).

required
table_name str

Target table.

required
fields Optional[List[str]]

Field names to match against. Defaults to every string-typed field on the schema. Names not present as string columns are silently dropped.

None
case_sensitive bool

When False, matches case-insensitively.

True
k int

Maximum number of rows returned.

10
output_format str

"json" (list of dicts, default) or "csv" (CSV string).

'json'
Source code in synalinks/src/knowledge_bases/database_adapters/duckdb_adapter.py
async def regex_search(
    self,
    pattern: str,
    *,
    table_name: str,
    fields: Optional[List[str]] = None,
    case_sensitive: bool = True,
    k: int = 10,
    output_format: str = "json",
):
    """Return rows in which any selected string column matches a regex.

    The query is a single ``SELECT`` whose ``WHERE`` clause ORs one
    ``regexp_matches`` call per candidate column; the pattern, the
    option flag and the row limit are all bound as parameters.
    DuckDB ships RE2 (Google's regex library), so evaluation is
    linear-time even when ``pattern`` comes from an untrusted source.

    Args:
        pattern: The regex pattern (RE2 syntax). An empty pattern
            yields an empty result without touching the database.
        table_name: Target table.
        fields: Field names to match against. Defaults to every
            string-typed field on the schema. Names not present
            as string columns are silently dropped.
        case_sensitive: When ``False``, matches case-insensitively.
        k: Maximum number of rows returned.
        output_format: ``"json"`` (list of dicts, default) or
            ``"csv"`` (CSV string).

    Raises:
        RuntimeError: If DuckDB fails to execute the query.
    """
    if not pattern:
        return format_search_results([], output_format)

    label = table_identifier(table_name)
    table_schema = self._duckdb_table_to_json_schema(label)

    # Only string-typed properties are searchable.
    candidates = []
    for prop_name, prop_info in table_schema.get("properties", {}).items():
        if prop_info.get("type") == "string":
            candidates.append(column_identifier(prop_name))

    if fields is None:
        targets = candidates
    else:
        wanted = {column_identifier(f) for f in fields}
        targets = [col for col in candidates if col in wanted]

    if not targets:
        warnings.warn(
            f"Skipping regex search for {label}: no matching string fields."
        )
        return format_search_results([], output_format)

    options = "" if case_sensitive else "i"
    predicate = " OR ".join(f"regexp_matches({col}, ?, ?)" for col in targets)
    query = f"SELECT * FROM {label} WHERE {predicate} LIMIT ?;"

    # One (pattern, options) pair per column, then the LIMIT value.
    bindings = [value for _ in targets for value in (pattern, options)]
    bindings.append(k)

    with self._connect(read_only=True) as con:
        try:
            arrow_table = con.execute(query, bindings).arrow().read_all()
        except Exception as e:
            raise RuntimeError(f"Regex query failed for table '{label}': {e}")
    return format_search_results(arrow_table, output_format)

rename(source, *, table_name=None, table_description=None) async

Rename a table and/or update its schema description.

At least one of table_name / table_description must be given. When table_name changes, the FTS and HNSW indexes are dropped and rebuilt under the new name, and the adapter's known-tables registry is updated so subsequent searches find the table at its new name.

Parameters:

Name Type Description Default
source Union[Any, str]

SymbolicDataModel for the table to rename, or its name as a string. String form is PascalCase-normalized so callers can pass the original filename-style input they used in from_csv.

required
table_name Optional[str]

New table name. Optional. Always normalized to PascalCase.

None
table_description Optional[str]

New schema description. Optional. Lives in the SymbolicDataModel layer (DuckDB doesn't carry per-table descriptions natively).

None

Returns:

Type Description
SymbolicDataModel

A fresh SymbolicDataModel for the (possibly renamed) table, reflecting the post-rename column shape and the supplied description.

Source code in synalinks/src/knowledge_bases/database_adapters/duckdb_adapter.py
async def rename(
    self,
    source: Union[Any, str],
    *,
    table_name: Optional[str] = None,
    table_description: Optional[str] = None,
) -> SymbolicDataModel:
    """Rename a table and/or update its schema description.

    At least one of ``table_name`` / ``table_description`` must be
    given. When ``table_name`` changes, the FTS and HNSW indexes
    are dropped and rebuilt under the new name, and the adapter's
    known-tables registry is updated so subsequent searches find
    the table at its new name.

    Args:
        source: ``SymbolicDataModel`` for the table to rename, or
            its name as a string. String form is PascalCase-
            normalized so callers can pass the original filename-
            style input they used in `from_csv`.
        table_name: New table name. Optional. Always normalized to
            PascalCase.
        table_description: New schema description. Optional. Lives
            in the ``SymbolicDataModel`` layer (DuckDB doesn't
            carry per-table descriptions natively).

    Returns:
        A fresh `SymbolicDataModel` for the (possibly
        renamed) table, reflecting the post-rename column shape
        and the supplied description.

    Raises:
        ValueError: If neither ``table_name`` nor
            ``table_description`` is given, if ``source`` is a model
            whose schema has no title, or if the resolved table does
            not exist in the database.
    """
    if table_name is None and table_description is None:
        raise ValueError(
            "rename(): pass at least one of `table_name=` or `table_description=`."
        )

    # Resolve the current table name from either form of ``source``.
    if isinstance(source, str):
        raw_old = source
    else:
        raw_old = source.get_schema().get("title")
        if not raw_old:
            raise ValueError(
                "rename(): source SymbolicDataModel has no schema "
                "title; cannot determine the table to rename."
            )
    old_name = table_identifier(raw_old)

    # Confirm the table exists before issuing any DDL against it.
    with self._connect(read_only=True) as con:
        exists = con.execute(
            f"SELECT COUNT(*) FROM information_schema.tables "
            f"WHERE table_schema='{MAIN_TABLE}' AND table_name=?",
            [old_name],
        ).fetchone()[0]
    if not exists:
        raise ValueError(
            f"rename(): no table named {old_name!r} found in the knowledge base."
        )

    # Defaults to the old name so a description-only call performs no DDL.
    new_name = old_name
    if table_name is not None:
        new_name = table_identifier(table_name)

        if new_name != old_name:
            # FTS/HNSW indexes are name-bound; drop then rebuild.
            with self._connect(read_only=False) as con:
                try:
                    con.execute(f"PRAGMA drop_fts_index('main.{old_name}');")
                except duckdb.Error:
                    # NOTE(review): presumably raised when the table has no
                    # FTS index yet; deliberately ignored — confirm.
                    pass

                old_vector_index = f"vector_main_{old_name}"
                con.execute(f"DROP INDEX IF EXISTS {old_vector_index};")

                con.execute(f"ALTER TABLE {old_name} RENAME TO {new_name};")

    # Re-read the schema from the database under the (possibly new) name.
    schema = self._duckdb_table_to_json_schema(new_name)
    schema["title"] = new_name
    if table_description is not None:
        schema["description"] = table_description
    else:
        # No new description supplied: carry over the one on the source
        # model. A string ``source`` has no description to carry over.
        if not isinstance(source, str):
            old_schema = source.get_schema()
            if "description" in old_schema:
                schema["description"] = old_schema["description"]
    renamed_model = SymbolicDataModel(schema=schema, name=new_name)

    # Swap the registry entry so the table is known under its new name.
    self.data_models.pop(old_name, None)
    self.data_models[new_name] = renamed_model

    # Index creation is best-effort: a failure is reported as a warning
    # and the renamed model is still returned.
    try:
        self._maybe_create_fulltext_index(renamed_model)
    except Exception as e:
        warnings.warn(
            f"FTS index rebuild failed for '{new_name}'; "
            f"fulltext_search results may be stale. ({e})"
        )
    try:
        self._maybe_create_vector_index(renamed_model)
    except Exception as e:
        warnings.warn(
            f"Vector index rebuild failed for '{new_name}'; "
            f"similarity_search will fall back to scan. ({e})"
        )

    return renamed_model

Vector similarity search against a single table.

Parameters:

Name Type Description Default
text_or_texts Union[str, List[str]]

Query text, or list of query texts. Multiple queries are merged into a single ranked result set (best score per id kept).

required
table_name str

Target table.

required
k int

Maximum number of rows returned.

10
threshold Optional[float]

Optional maximum vector distance — rows beyond this distance are dropped.

None
ef_search Optional[int]

Optional override for HNSW's search-time candidate-list depth. None keeps the index-time value (or DuckDB's default of 64); higher = better recall at slower query time. Mirrors Ladybug's ef_search.

None
output_format str

"json" (list of dicts, default — Python data) or "csv" (CSV string, more compact for LM input).

'json'
Source code in synalinks/src/knowledge_bases/database_adapters/duckdb_adapter.py
async def similarity_search(
    self,
    text_or_texts: Union[str, List[str]],
    *,
    table_name: str,
    k: int = 10,
    threshold: Optional[float] = None,
    ef_search: Optional[int] = None,
    output_format: str = "json",
):
    """Vector similarity search against a single table.

    Args:
        text_or_texts: Query text, or list of query texts. Multiple
            queries are merged into a single ranked result set
            (best score per id kept).
        table_name: Target table.
        k: Maximum number of rows returned.
        threshold: Optional maximum vector distance — rows beyond
            this distance are dropped. ``0.0`` is a valid threshold
            and is applied; only ``None`` disables filtering.
        ef_search: Optional override for HNSW's search-time
            candidate-list depth. ``None`` keeps the index-time
            value (or DuckDB's default of 64); higher = better
            recall at slower query time. Mirrors Ladybug's
            ``ef_search``.
        output_format: ``"json"`` (list of dicts, default — Python
            data) or ``"csv"`` (CSV string, more compact for LM
            input).

    Returns:
        The matching rows with an added ``score`` column, ordered by
        ascending score, passed through `format_search_results`.

    Raises:
        ValueError: If the adapter has no embedding model.
        RuntimeError: If DuckDB fails to execute the search query.
    """
    if not text_or_texts:
        return format_search_results([], output_format)
    if not self.embedding_model:
        raise ValueError(
            "similarity_search requires an embedding model on the adapter."
        )

    texts = [text_or_texts] if not isinstance(text_or_texts, list) else text_or_texts

    label = table_identifier(table_name)
    schema = self._duckdb_table_to_json_schema(label)
    id_key = self._get_id_key(schema)

    embeddings = await self.embedding_model(EmbeddingRequest(texts=texts))
    vectors = embeddings.get("embeddings")
    if not vectors:
        # Nothing was embedded (missing or empty "embeddings"): there is
        # no query vector to search with.
        return format_search_results([], output_format)
    await self._ensure_vector_dim(vectors[0])

    dist_fn = _VSS_DISTANCE_FN[self.metric]
    # Canonical score, shared with LanceDBAdapter. ``array_distance`` is the
    # (non-squared) Euclidean distance, so square it for "l2sq" to match
    # LanceDB's squared-L2; "cosine"/"ip" use the raw function. ``threshold``
    # is compared in the same units as the score (one ``?`` per occurrence).
    _d = f"{dist_fn}({self.vss_key}, ?::FLOAT[{self.vector_dim}])"
    score_expr = f"power({_d}, 2)" if self.metric == "l2sq" else _d

    # The WHERE clause and the bound parameters must be gated by the SAME
    # condition. Testing truthiness here would drop the clause for
    # ``threshold=0.0`` while still binding its two extra parameters.
    has_threshold = threshold is not None
    where_clause = f"WHERE {score_expr} < ?" if has_threshold else ""
    # The statement is identical for every query vector; build it once.
    sql = f"""
        SELECT *,
            {score_expr} AS score
        FROM {label}
        {where_clause}
        ORDER BY score ASC
        LIMIT ?;
    """

    def _bind(vector):
        # Placeholder order: score expr in SELECT, then (score expr,
        # threshold) in WHERE when present, then LIMIT.
        params = [vector]
        if has_threshold:
            params.extend([vector, threshold])
        params.append(k)
        return params

    if len(vectors) == 1:
        with (
            self._connect(read_only=True) as con,
            self._hnsw_ef_search_override(con, ef_search),
        ):
            try:
                arrow_table = con.execute(sql, _bind(vectors[0])).arrow().read_all()
            except Exception as e:
                raise RuntimeError(
                    f"Vector search failed for table '{label}': {e}"
                ) from e
        return format_search_results(arrow_table, output_format)

    # Multi-query: dedupe by id, keep best score, take top-k.
    results: Dict[Any, Dict[str, Any]] = {}
    with (
        self._connect(read_only=True) as con,
        self._hnsw_ef_search_override(con, ef_search),
    ):
        for vector in vectors:
            try:
                rows = con.execute(sql, _bind(vector)).arrow().read_all().to_pylist()
            except Exception as e:
                raise RuntimeError(
                    f"Vector search failed for table '{label}': {e}"
                ) from e

            for row in rows:
                uid = row[id_key]
                prev = results.get(uid)
                # Lower score is better (results are ordered ascending).
                if prev is None or row["score"] < prev["score"]:
                    results[uid] = row

    ranked = sorted(results.values(), key=lambda r: r["score"])[:k]
    return format_search_results(ranked, output_format)

sql(sql, *, params=None, read_only=True, output_format='json', **kwargs) async

Execute a raw SQL query against the database.

Parameters:

Name Type Description Default
sql str

The SQL string.

required
params Optional[List[Any]]

Optional positional parameters for the query.

None
read_only bool

When True (default), enforces SELECT-only at the parser layer to reject multi-statement injection, COPY ... TO '/path' filesystem writes, and other non-SELECT statements. Set False for trusted mutations.

True
output_format str

"json" (list of dicts, default) or "csv" (CSV string).

'json'
Source code in synalinks/src/knowledge_bases/database_adapters/duckdb_adapter.py
async def sql(
    self,
    sql: str,
    *,
    params: Optional[List[Any]] = None,
    read_only: bool = True,
    output_format: str = "json",
    **kwargs,
):
    """Run a caller-supplied SQL string and return the formatted result.

    Args:
        sql: The SQL string.
        params: Optional positional parameters for the query.
        read_only: When ``True`` (default), the statement list is
            parsed first and anything other than ``SELECT`` is
            rejected — this blocks multi-statement injection,
            ``COPY ... TO '/path'`` filesystem writes, and other
            non-SELECT statements. Set ``False`` for trusted
            mutations.
        output_format: ``"json"`` (list of dicts, default) or
            ``"csv"`` (CSV string).
        **kwargs: Accepted but not used by this method.

    Raises:
        duckdb.InvalidInputException: In read-only mode, when the SQL
            cannot be parsed, is empty, or contains a non-SELECT
            statement.
    """
    if read_only:
        # A read-only connection alone is not enough: it would still let
        # `COPY (...) TO '/path'` through, so vet statements at the parser.
        try:
            parsed = duckdb.extract_statements(sql)
        except duckdb.Error as e:
            raise duckdb.InvalidInputException(f"Invalid SQL: {e}") from e
        if not parsed:
            raise duckdb.InvalidInputException("Empty SQL query.")

        # Report the first statement that is not a SELECT, if any.
        offending = next(
            (s for s in parsed if s.type != duckdb.StatementType.SELECT),
            None,
        )
        if offending is not None:
            raise duckdb.InvalidInputException(
                f"read_only=True only permits SELECT statements; "
                f"got {offending.type.name}."
            )

    bound_params = params or []
    with self._connect(read_only=read_only) as con:
        arrow_table = con.execute(sql, bound_params).arrow().read_all()
    return format_search_results(arrow_table, output_format)

update(data_model_or_data_models) async

Update or insert records. Returns the primary key value(s).

Source code in synalinks/src/knowledge_bases/database_adapters/duckdb_adapter.py
async def update(
    self,
    data_model_or_data_models: Union[List[JsonDataModel], JsonDataModel],
) -> Union[Any, List[Any]]:
    """Upsert one record or a list of records and return their keys.

    Records are grouped by table and column layout so that each group
    is written with a single ``executemany``; all groups share one
    transaction that is rolled back if any write fails. Index
    rebuilds run afterwards and only warn on failure.

    Args:
        data_model_or_data_models: A single data model, or a list of
            them. Items that are not ``JsonDataModel`` instances are
            converted with ``to_json_data_model()``.

    Returns:
        The primary key value for a single input, or the list of
        primary key values (in input order) for a list input.

    Raises:
        ValueError: If a record has no value for its primary key.
    """
    return_single = not isinstance(data_model_or_data_models, list)
    if return_single:
        data_models = [data_model_or_data_models]
    else:
        data_models = data_model_or_data_models

    # Group rows by (table, column-shape): one executemany per group.
    ids: List[Any] = []
    buckets: Dict[tuple, Dict[str, Any]] = {}
    tables_seen: List[Any] = []
    tables_seen_set: set = set()

    for data_model in data_models:
        if not isinstance(data_model, JsonDataModel):
            data_model = data_model.to_json_data_model()

        schema = data_model.get_schema()
        table = table_identifier(schema["title"])
        json_data = sanitize_properties(data_model.get_json())
        id_key = self._get_id_key(schema)

        id_val = json_data.get(id_key)
        if id_val is None:
            raise ValueError(f"Primary key '{id_key}' is required but not provided")

        if table not in tables_seen_set:
            # First record for this table: learn the vector dimension from
            # its embedding (or an on-loop probe) before the FLOAT[dim]
            # column is created.
            await self._ensure_vector_dim(json_data.get(self.vss_key))
            self._maybe_create_table(data_model)
            tables_seen.append(data_model)
            tables_seen_set.add(table)

        cols = tuple(json_data.keys())
        group_key = (table, cols)
        if group_key not in buckets:
            buckets[group_key] = {"id_key": id_key, "cols": cols, "params": []}
        # Values come out in the same order as ``cols`` (both from json_data).
        buckets[group_key]["params"].append(list(json_data.values()))
        ids.append(id_val)

    with self._connect(read_only=False) as con:
        con.execute("BEGIN TRANSACTION;")
        try:
            for (table, cols), bucket in buckets.items():
                id_key = bucket["id_key"]
                col_sql = ", ".join(cols)
                placeholders = ", ".join("?" for _ in cols)
                assignments = [f"{c} = EXCLUDED.{c}" for c in cols if c != id_key]

                if assignments:
                    update_sql = ", ".join(assignments)
                    conflict_clause = (
                        f"ON CONFLICT ({id_key}) DO UPDATE SET {update_sql}"
                    )
                else:
                    # Only the key column is present: nothing to update.
                    conflict_clause = f"ON CONFLICT ({id_key}) DO NOTHING"

                statement = (
                    f"INSERT INTO {table} ({col_sql}) "
                    f"VALUES ({placeholders}) "
                    f"{conflict_clause};"
                )
                con.executemany(statement, bucket["params"])
        except Exception:
            con.execute("ROLLBACK;")
            raise
        else:
            con.execute("COMMIT;")

    # Index rebuilds are best-effort — the data is already committed.
    # All FTS rebuilds run first, then all vector rebuilds.
    for seen_model in tables_seen:
        try:
            self._maybe_create_fulltext_index(seen_model)
        except Exception as e:
            table = table_identifier(seen_model.get_schema()["title"])
            warnings.warn(
                f"FTS index rebuild failed for '{table}'; "
                f"fulltext_search results may be stale. ({e})"
            )

    for seen_model in tables_seen:
        try:
            self._maybe_create_vector_index(seen_model)
        except Exception as e:
            table = table_identifier(seen_model.get_schema()["title"])
            warnings.warn(
                f"Vector index rebuild failed for '{table}'; "
                f"similarity_search will fall back to scan. ({e})"
            )

    if return_single:
        return ids[0]
    return ids

wipe_database()

Drop every table in the main schema, clearing all data.

Source code in synalinks/src/knowledge_bases/database_adapters/duckdb_adapter.py
def wipe_database(self):
    """Drop every table in the ``main`` schema, clearing all data.

    ``information_schema.tables`` lists views alongside base tables,
    so views are dropped with ``DROP VIEW`` and everything else with
    ``DROP TABLE``. Names are double-quoted so that reserved words or
    unusual characters in a table name cannot break the statement.

    Raises:
        RuntimeError: If any drop statement fails; the original
            error is chained as the cause.
    """
    with self._connect(read_only=False) as con:
        entries = con.execute(f"""
            SELECT table_name, table_type
            FROM information_schema.tables
            WHERE table_schema='{MAIN_TABLE}';
        """).fetchall()

        # Views first, so no view is left pointing at an already-dropped
        # table while the wipe is in progress (stable sort keeps the
        # catalog order otherwise).
        for table_name, table_type in sorted(
            entries, key=lambda entry: entry[1] != "VIEW"
        ):
            object_kind = "VIEW" if table_type == "VIEW" else "TABLE"
            # Standard SQL identifier quoting: wrap in double quotes and
            # double any embedded quote character.
            quoted = '"' + table_name.replace('"', '""') + '"'
            try:
                con.execute(f"DROP {object_kind} IF EXISTS {quoted}")
            except Exception as e:
                raise RuntimeError(
                    f"Failed to drop table {table_name}: {e}"
                ) from e

sanitize_properties(properties)

Sanitize and snake_case-normalize dict keys before SQL interpolation.

Source code in synalinks/src/knowledge_bases/database_adapters/duckdb_adapter.py
def sanitize_properties(properties: dict):
    """Return a copy of ``properties`` with each key run through `column_identifier`.

    Used to sanitize and snake_case-normalize keys before they are
    interpolated into SQL. Values are kept as-is; if two keys map to
    the same identifier, the later one wins.
    """
    sanitized = {}
    for raw_key, value in properties.items():
        sanitized[column_identifier(raw_key)] = value
    return sanitized